1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Portions copyright (c) 2011, Joyent, Inc. All rights reserved.
24 */
25
26 /*
27 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
28 * Use is subject to license terms.
29 */
30
31 /* #pragma ident "@(#)dtrace.c 1.65 08/07/02 SMI" */
32
33 /*
34 * DTrace - Dynamic Tracing for Solaris
35 *
36 * This is the implementation of the Solaris Dynamic Tracing framework
37 * (DTrace). The user-visible interface to DTrace is described at length in
38 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
39 * library, the in-kernel DTrace framework, and the DTrace providers are
40 * described in the block comments in the <sys/dtrace.h> header file. The
41 * internal architecture of DTrace is described in the block comments in the
42 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
43 * implementation very much assume mastery of all of these sources; if one has
44 * an unanswered question about the implementation, one should consult them
45 * first.
46 *
47 * The functions here are ordered roughly as follows:
48 *
49 * - Probe context functions
50 * - Probe hashing functions
51 * - Non-probe context utility functions
52 * - Matching functions
53 * - Provider-to-Framework API functions
54 * - Probe management functions
55 * - DIF object functions
56 * - Format functions
57 * - Predicate functions
58 * - ECB functions
59 * - Buffer functions
60 * - Enabling functions
61 * - DOF functions
62 * - Anonymous enabling functions
63 * - Consumer state functions
64 * - Helper functions
65 * - Hook functions
66 * - Driver cookbook functions
67 *
68 * Each group of functions begins with a block comment labelled the "DTrace
69 * [Group] Functions", allowing one to find each block by searching forward
70 * on capital-f functions.
71 */
72 #include <sys/errno.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75 #include <sys/conf.h>
76 #include <sys/systm.h>
77 #include <sys/dtrace_impl.h>
78 #include <sys/param.h>
79 #include <sys/proc_internal.h>
80 #include <sys/ioctl.h>
81 #include <sys/fcntl.h>
82 #include <miscfs/devfs/devfs.h>
83 #include <sys/malloc.h>
84 #include <sys/kernel_types.h>
85 #include <sys/proc_internal.h>
86 #include <sys/uio_internal.h>
87 #include <sys/kauth.h>
88 #include <vm/pmap.h>
89 #include <sys/user.h>
90 #include <mach/exception_types.h>
91 #include <sys/signalvar.h>
92 #include <mach/task.h>
93 #include <kern/zalloc.h>
94 #include <kern/ast.h>
95 #include <kern/task.h>
96 #include <netinet/in.h>
97
98 #include <kern/cpu_data.h>
99 extern uint32_t pmap_find_phys(void *, uint64_t);
100 extern boolean_t pmap_valid_page(uint32_t);
101 extern void OSKextRegisterKextsWithDTrace(void);
102 extern kmod_info_t g_kernel_kmod_info;
103
104 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
105 #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
106
107 #define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */
108
109 extern void dtrace_suspend(void);
110 extern void dtrace_resume(void);
111 extern void dtrace_init(void);
112 extern void helper_init(void);
113 extern void fasttrap_init(void);
114 extern void dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
115 extern void dtrace_lazy_dofs_destroy(proc_t *);
116 extern void dtrace_postinit(void);
117
118 #include "../../../osfmk/chud/chud_dtrace.h"
119
120 extern kern_return_t chudxnu_dtrace_callback
121 (uint64_t selector, uint64_t *args, uint32_t count);
122
123 /* Import this function to retrieve the size of physical memory. */
124 extern int kernel_sysctlbyname(const char *name, void *oldp,
125 size_t *oldlenp, void *newp, size_t newlen);
126
127 /*
128 * DTrace Tunable Variables
129 *
130 * The following variables may be dynamically tuned by using sysctl(8), the
131 * variables being stored in the kern.dtrace namespace. For example:
132 * sysctl kern.dtrace.dof_maxsize = 1048575 # 1M
133 *
134 * In general, the only variables that one should be tuning this way are those
135 * that affect system-wide DTrace behavior, and for which the default behavior
136 * is undesirable. Most of these variables are tunable on a per-consumer
137 * basis using DTrace options, and need not be tuned on a system-wide basis.
138 * When tuning these variables, avoid pathological values; while some attempt
139 * is made to verify the integrity of these variables, they are not considered
140 * part of the supported interface to DTrace, and they are therefore not
141 * checked comprehensively.
142 */
143 uint64_t dtrace_buffer_memory_maxsize = 0; /* initialized in dtrace_init */
144 uint64_t dtrace_buffer_memory_inuse = 0;
145 int dtrace_destructive_disallow = 0;
146 dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
147 size_t dtrace_difo_maxsize = (256 * 1024);
148 dtrace_optval_t dtrace_dof_maxsize = (384 * 1024);
149 size_t dtrace_global_maxsize = (16 * 1024);
150 size_t dtrace_actions_max = (16 * 1024);
151 size_t dtrace_retain_max = 1024;
152 dtrace_optval_t dtrace_helper_actions_max = 32;
153 dtrace_optval_t dtrace_helper_providers_max = 64;
154 dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
155 size_t dtrace_strsize_default = 256;
156 dtrace_optval_t dtrace_cleanrate_default = 990099000; /* 1.1 hz */
157 dtrace_optval_t dtrace_cleanrate_min = 20000000; /* 50 hz */
158 dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
159 dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
160 dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
161 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
162 dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
163 dtrace_optval_t dtrace_nspec_default = 1;
164 dtrace_optval_t dtrace_specsize_default = 32 * 1024;
165 dtrace_optval_t dtrace_stackframes_default = 20;
166 dtrace_optval_t dtrace_ustackframes_default = 20;
167 dtrace_optval_t dtrace_jstackframes_default = 50;
168 dtrace_optval_t dtrace_jstackstrsize_default = 512;
169 int dtrace_msgdsize_max = 128;
170 hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */
171 hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
172 int dtrace_devdepth_max = 32;
173 int dtrace_err_verbose;
174 int dtrace_provide_private_probes = 0;
175 hrtime_t dtrace_deadman_interval = NANOSEC;
176 hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
177 hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
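
/*
 * Illustrative sketch (not part of the implementation): the tunables above
 * surface in the kern.dtrace sysctl namespace and can be read from user
 * space with sysctlbyname(3). A minimal example for kern.dtrace.dof_maxsize,
 * which is registered as a CTLTYPE_QUAD OID later in this file:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int64_t maxsize = 0;
 *		size_t len = sizeof(maxsize);
 *
 *		if (sysctlbyname("kern.dtrace.dof_maxsize",
 *		    &maxsize, &len, NULL, 0) == 0)
 *			printf("dof_maxsize = %lld bytes\n", (long long)maxsize);
 *		return (0);
 *	}
 */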
178
179 /*
180 * DTrace External Variables
181 *
182 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
183 * available to DTrace consumers via the backtick (`) syntax. One of these,
184 * dtrace_zero, is made deliberately so: it is provided as a source of
185 * well-known, zero-filled memory. While this variable is not documented,
186 * it is used by some translators as an implementation detail.
187 */
188 const char dtrace_zero[256] = { 0 }; /* zero-filled memory */
189 unsigned int dtrace_max_cpus = 0; /* number of enabled cpus */
190 /*
191 * DTrace Internal Variables
192 */
193 static dev_info_t *dtrace_devi; /* device info */
194 static vmem_t *dtrace_arena; /* probe ID arena */
195 static vmem_t *dtrace_minor; /* minor number arena */
196 static taskq_t *dtrace_taskq; /* task queue */
197 static dtrace_probe_t **dtrace_probes; /* array of all probes */
198 static int dtrace_nprobes; /* number of probes */
199 static dtrace_provider_t *dtrace_provider; /* provider list */
200 static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
201 static int dtrace_opens; /* number of opens */
202 static int dtrace_helpers; /* number of helpers */
203 static void *dtrace_softstate; /* softstate pointer */
204 static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
205 static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
206 static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
207 static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
208 static int dtrace_toxranges; /* number of toxic ranges */
209 static int dtrace_toxranges_max; /* size of toxic range array */
210 static dtrace_anon_t dtrace_anon; /* anonymous enabling */
211 static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
212 static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
213 static kthread_t *dtrace_panicked; /* panicking thread */
214 static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
215 static dtrace_genid_t dtrace_probegen; /* current probe generation */
216 static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
217 static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
218 static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
219 static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
220
221 static int dtrace_dof_mode; /* See dtrace_impl.h for a description of Darwin's dof modes. */
222
223 /*
224 * This doesn't quite fit as an internal variable, as it must be accessed in
225 * fbt_provide and sdt_provide. It's clearly not a dtrace tunable variable either...
226 */
227 int dtrace_kernel_symbol_mode; /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
228
229
230 /*
231 * To save memory, some common memory allocations are given a
232 * unique zone. For example, dtrace_probe_t is 72 bytes in size,
233 * which means it would fall into the kalloc.128 bucket. With
234 * 20k elements allocated, the space saved is substantial.
235 */
236
237 struct zone *dtrace_probe_t_zone;
238
239 static int dtrace_module_unloaded(struct kmod_info *kmod);
240
241 /*
242 * DTrace Locking
243 * DTrace is protected by three (relatively coarse-grained) locks:
244 *
245 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
246 * including enabling state, probes, ECBs, consumer state, helper state,
247 * etc. Importantly, dtrace_lock is _not_ required when in probe context;
248 * probe context is lock-free -- synchronization is handled via the
249 * dtrace_sync() cross call mechanism.
250 *
251 * (2) dtrace_provider_lock is required when manipulating provider state, or
252 * when provider state must be held constant.
253 *
254 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
255 * when meta provider state must be held constant.
256 *
257 * The lock ordering between these three locks is dtrace_meta_lock before
258 * dtrace_provider_lock before dtrace_lock. (In particular, there are
259 * several places where dtrace_provider_lock is held by the framework as it
260 * calls into the providers -- which then call back into the framework,
261 * grabbing dtrace_lock.)
262 *
263 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
264 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
265 * role as a coarse-grained lock; it is acquired before both of these locks.
266 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
267 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
268 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
269 * acquired _between_ dtrace_provider_lock and dtrace_lock.
270 */
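
/*
 * Illustrative sketch (not part of the implementation): the ordering
 * described above, written out as an acquisition sequence. A given code
 * path takes only the locks it needs, but when several are needed they
 * must be acquired in this order (and dropped in the reverse order):
 *
 *	lck_mtx_lock(&dtrace_meta_lock);	outermost
 *	lck_mtx_lock(&cpu_lock);
 *	lck_mtx_lock(&dtrace_provider_lock);
 *	lck_mtx_lock(&mod_lock);
 *	lck_mtx_lock(&dtrace_lock);		innermost
 */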
271
272
273 /*
274 * APPLE NOTE:
275 *
276 * For porting purposes, all kmutex_t vars have been changed
277 * to lck_mtx_t, which require explicit initialization.
278 *
279 * kmutex_t becomes lck_mtx_t
280 * mutex_enter() becomes lck_mtx_lock()
281 * mutex_exit() becomes lck_mtx_unlock()
282 *
283 * Lock asserts are changed like this:
284 *
285 * ASSERT(MUTEX_HELD(&cpu_lock));
286 * becomes:
287 * lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
288 *
289 */
290 static lck_mtx_t dtrace_lock; /* probe state lock */
291 static lck_mtx_t dtrace_provider_lock; /* provider state lock */
292 static lck_mtx_t dtrace_meta_lock; /* meta-provider state lock */
293 static lck_rw_t dtrace_dof_mode_lock; /* dof mode lock */
294
295 /*
296 * DTrace Provider Variables
297 *
298 * These are the variables relating to DTrace as a provider (that is, the
299 * provider of the BEGIN, END, and ERROR probes).
300 */
301 static dtrace_pattr_t dtrace_provider_attr = {
302 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
303 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
304 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
305 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
306 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
307 };
308
309 static void
310 dtrace_nullop(void)
311 {}
312
313 static int
314 dtrace_enable_nullop(void)
315 {
316 return (0);
317 }
318
319 static dtrace_pops_t dtrace_provider_ops = {
320 (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
321 (void (*)(void *, struct modctl *))dtrace_nullop,
322 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
323 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
324 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
325 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
326 NULL,
327 NULL,
328 NULL,
329 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop
330 };
331
332 static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
333 static dtrace_id_t dtrace_probeid_end; /* special END probe */
334 dtrace_id_t dtrace_probeid_error; /* special ERROR probe */
335
336 /*
337 * DTrace Helper Tracing Variables
338 */
339 uint32_t dtrace_helptrace_next = 0;
340 uint32_t dtrace_helptrace_nlocals;
341 char *dtrace_helptrace_buffer;
342 size_t dtrace_helptrace_bufsize = 512 * 1024;
343
344 #if DEBUG
345 int dtrace_helptrace_enabled = 1;
346 #else
347 int dtrace_helptrace_enabled = 0;
348 #endif
349
350
351 /*
352 * DTrace Error Hashing
353 *
354 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
355 * table. This is very useful for checking coverage of tests that are
356 * expected to induce DIF or DOF processing errors, and may be useful for
357 * debugging problems in the DIF code generator or in DOF generation. The
358 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
359 */
360 #if DEBUG
361 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
362 static const char *dtrace_errlast;
363 static kthread_t *dtrace_errthread;
364 static lck_mtx_t dtrace_errlock;
365 #endif
366
367 /*
368 * DTrace Macros and Constants
369 *
370 * These are various macros that are useful in various spots in the
371 * implementation, along with a few random constants that have no meaning
372 * outside of the implementation. There is no real structure to this cpp
373 * mishmash -- but is there ever?
374 */
375 #define DTRACE_HASHSTR(hash, probe) \
376 dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
377
378 #define DTRACE_HASHNEXT(hash, probe) \
379 (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
380
381 #define DTRACE_HASHPREV(hash, probe) \
382 (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
383
384 #define DTRACE_HASHEQ(hash, lhs, rhs) \
385 (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
386 *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
387
388 #define DTRACE_AGGHASHSIZE_SLEW 17
389
390 #define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)
391
392 /*
393 * The key for a thread-local variable consists of the lower 61 bits of the
394 * current_thread(), plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
395 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
396 * equal to a variable identifier. This is necessary (but not sufficient) to
397 * assure that global associative arrays never collide with thread-local
398 * variables. To guarantee that they cannot collide, we must also define the
399 * order for keying dynamic variables. That order is:
400 *
401 * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
402 *
403 * Because the variable-key and the tls-key are in orthogonal spaces, there is
404 * no way for a global variable key signature to match a thread-local key
405 * signature.
406 */
407 #if defined (__x86_64__)
408 /* FIXME: two function calls!! */
409 #define DTRACE_TLS_THRKEY(where) { \
410 uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
411 uint64_t thr = (uintptr_t)current_thread(); \
412 ASSERT(intr < (1 << 3)); \
413 (where) = ((thr + DIF_VARIABLE_MAX) & \
414 (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
415 }
416 #else
417 #error Unknown architecture
418 #endif
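
/*
 * Illustrative sketch (not part of the implementation): the key produced by
 * DTRACE_TLS_THRKEY() is laid out as follows on this port:
 *
 *	uint64_t key;
 *	DTRACE_TLS_THRKEY(key);
 *
 *	bits 63..61:	interrupt indicator from ml_at_interrupt_context()
 *			(effectively 0 or 1 here, despite the 3-bit field)
 *	bits 60..0:	(uintptr_t)current_thread() + DIF_VARIABLE_MAX,
 *			truncated to 61 bits
 *
 * Adding DIF_VARIABLE_MAX keeps thread keys disjoint from variable
 * identifiers, as explained in the block comment above.
 */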
419
420 #define DT_BSWAP_8(x) ((x) & 0xff)
421 #define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
422 #define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
423 #define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
424
425 #define DT_MASK_LO 0x00000000FFFFFFFFULL
426
427 #define DTRACE_STORE(type, tomax, offset, what) \
428 *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
429
430
431 #define DTRACE_ALIGNCHECK(addr, size, flags) \
432 if (addr & (MIN(size,4) - 1)) { \
433 *flags |= CPU_DTRACE_BADALIGN; \
434 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
435 return (0); \
436 }
437
438 /*
439 * Test whether a range of memory starting at testaddr of size testsz falls
440 * within the range of memory described by addr, sz. We take care to avoid
441 * problems with overflow and underflow of the unsigned quantities, and
442 * disallow all negative sizes. Ranges of size 0 are allowed.
443 */
444 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
445 ((testaddr) - (baseaddr) < (basesz) && \
446 (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
447 (testaddr) + (testsz) >= (testaddr))
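
/*
 * Worked example (editor's illustration): with baseaddr = 0x1000 and
 * basesz = 0x100, the unsigned arithmetic above rejects bad ranges without
 * any signed comparisons:
 *
 *	testaddr = 0x0f00, testsz = 0x10:
 *		(0x0f00 - 0x1000) wraps to a huge unsigned value, so the
 *		first clause fails and the out-of-range start is rejected.
 *
 *	testaddr = 0xfffffffffffffff0, testsz = 0x20:
 *		(testaddr + testsz) wraps past zero, so the third clause
 *		fails and the overflowing range is rejected.
 */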
448
449 /*
450 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
451 * alloc_sz on the righthand side of the comparison in order to avoid overflow
452 * or underflow in the comparison with it. This is simpler than the INRANGE
453 * check above, because we know that the dtms_scratch_ptr is valid in the
454 * range. Allocations of size zero are allowed.
455 */
456 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
457 ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
458 (mstate)->dtms_scratch_ptr >= (alloc_sz))
459
460 #define RECOVER_LABEL(bits) dtraceLoadRecover##bits:
461
462 #if defined (__x86_64__)
463 #define DTRACE_LOADFUNC(bits) \
464 /*CSTYLED*/ \
465 uint##bits##_t dtrace_load##bits(uintptr_t addr); \
466 \
467 uint##bits##_t \
468 dtrace_load##bits(uintptr_t addr) \
469 { \
470 size_t size = bits / NBBY; \
471 /*CSTYLED*/ \
472 uint##bits##_t rval = 0; \
473 int i; \
474 volatile uint16_t *flags = (volatile uint16_t *) \
475 &cpu_core[CPU->cpu_id].cpuc_dtrace_flags; \
476 \
477 DTRACE_ALIGNCHECK(addr, size, flags); \
478 \
479 for (i = 0; i < dtrace_toxranges; i++) { \
480 if (addr >= dtrace_toxrange[i].dtt_limit) \
481 continue; \
482 \
483 if (addr + size <= dtrace_toxrange[i].dtt_base) \
484 continue; \
485 \
486 /* \
487 * This address falls within a toxic region; return 0. \
488 */ \
489 *flags |= CPU_DTRACE_BADADDR; \
490 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
491 return (0); \
492 } \
493 \
494 { \
495 volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits; \
496 *flags |= CPU_DTRACE_NOFAULT; \
497 recover = dtrace_set_thread_recover(current_thread(), recover); \
498 /*CSTYLED*/ \
499 /* \
500 * PR6394061 - avoid device memory that is unpredictably \
501 * mapped and unmapped \
502 */ \
503 if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr))) \
504 rval = *((volatile uint##bits##_t *)addr); \
505 RECOVER_LABEL(bits); \
506 (void)dtrace_set_thread_recover(current_thread(), recover); \
507 *flags &= ~CPU_DTRACE_NOFAULT; \
508 } \
509 \
510 return (rval); \
511 }
512 #else /* all other architectures */
513 #error Unknown Architecture
514 #endif
515
516 #ifdef __LP64__
517 #define dtrace_loadptr dtrace_load64
518 #else
519 #define dtrace_loadptr dtrace_load32
520 #endif
521
522 #define DTRACE_DYNHASH_FREE 0
523 #define DTRACE_DYNHASH_SINK 1
524 #define DTRACE_DYNHASH_VALID 2
525
526 #define DTRACE_MATCH_FAIL -1
527 #define DTRACE_MATCH_NEXT 0
528 #define DTRACE_MATCH_DONE 1
529 #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
530 #define DTRACE_STATE_ALIGN 64
531
532 #define DTRACE_FLAGS2FLT(flags) \
533 (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
534 ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
535 ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
536 ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
537 ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
538 ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
539 ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
540 ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
541 ((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
542 DTRACEFLT_UNKNOWN)
543
544 #define DTRACEACT_ISSTRING(act) \
545 ((act)->dta_kind == DTRACEACT_DIFEXPR && \
546 (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
547
548
549 static size_t dtrace_strlen(const char *, size_t);
550 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
551 static void dtrace_enabling_provide(dtrace_provider_t *);
552 static int dtrace_enabling_match(dtrace_enabling_t *, int *);
553 static void dtrace_enabling_matchall(void);
554 static dtrace_state_t *dtrace_anon_grab(void);
555 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
556 dtrace_state_t *, uint64_t, uint64_t);
557 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
558 static void dtrace_buffer_drop(dtrace_buffer_t *);
559 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
560 dtrace_state_t *, dtrace_mstate_t *);
561 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
562 dtrace_optval_t);
563 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
564 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
565
566
567 /*
568 * DTrace sysctl handlers
569 *
570 * These declarations and functions are used for deeper DTrace configuration.
571 * Most of them are not per-consumer and may impact other DTrace consumers.
572 * Not every value is validated for correctness, so be careful about the
573 * values you use.
574 */
575
576 SYSCTL_DECL(_kern_dtrace);
577 SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "dtrace");
578
579 static int
580 sysctl_dtrace_err_verbose SYSCTL_HANDLER_ARGS
581 {
582 #pragma unused(oidp, arg2)
583 int changed, error;
584 int value = *(int *) arg1;
585
586 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
587 if (error || !changed)
588 return (error);
589
590 if (value != 0 && value != 1)
591 return (ERANGE);
592
593 lck_mtx_lock(&dtrace_lock);
594 dtrace_err_verbose = value;
595 lck_mtx_unlock(&dtrace_lock);
596
597 return (0);
598 }
599
600 /*
601 * kern.dtrace.err_verbose
602 *
603 * Set DTrace verbosity when an error occurs (0 = disabled, 1 = enabled).
604 * Errors are reported when a DIFO or a DOF has been rejected by the kernel.
605 */
606 SYSCTL_PROC(_kern_dtrace, OID_AUTO, err_verbose,
607 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
608 &dtrace_err_verbose, 0,
609 sysctl_dtrace_err_verbose, "I", "dtrace error verbose");
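
/*
 * Illustrative sketch (not part of the implementation): toggling this OID
 * from user space with sysctlbyname(3). The handler above accepts only 0 or
 * 1, and writing it requires appropriate privileges:
 *
 *	int on = 1;
 *
 *	if (sysctlbyname("kern.dtrace.err_verbose", NULL, NULL,
 *	    &on, sizeof(on)) != 0)
 *		perror("kern.dtrace.err_verbose");
 */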
610
611 static int
612 sysctl_dtrace_buffer_memory_maxsize SYSCTL_HANDLER_ARGS
613 {
614 #pragma unused(oidp, arg2, req)
615 int changed, error;
616 uint64_t value = *(uint64_t *) arg1;
617
618 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
619 if (error || !changed)
620 return (error);
621
622 if (value <= dtrace_buffer_memory_inuse)
623 return (ERANGE);
624
625 lck_mtx_lock(&dtrace_lock);
626 dtrace_buffer_memory_maxsize = value;
627 lck_mtx_unlock(&dtrace_lock);
628
629 return (0);
630 }
631
632 /*
633 * kern.dtrace.buffer_memory_maxsize
634 *
635 * Set the maximum size, in bytes, that DTrace may use for all the consumers' state
636 * buffers. By default the limit is PHYS_MEM / 3 for *all* consumers. Attempting to set
637 * a zero or negative value, or a value <= dtrace_buffer_memory_inuse, will fail.
638 */
639 SYSCTL_PROC(_kern_dtrace, OID_AUTO, buffer_memory_maxsize,
640 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
641 &dtrace_buffer_memory_maxsize, 0,
642 sysctl_dtrace_buffer_memory_maxsize, "Q", "dtrace state buffer memory maxsize");
643
644 /*
645 * kern.dtrace.buffer_memory_inuse
646 *
647 * Current state buffer memory used, in bytes, by all the DTrace consumers.
648 * This value is read-only.
649 */
650 SYSCTL_QUAD(_kern_dtrace, OID_AUTO, buffer_memory_inuse, CTLFLAG_RD | CTLFLAG_LOCKED,
651 &dtrace_buffer_memory_inuse, "dtrace state buffer memory in-use");
652
653 static int
654 sysctl_dtrace_difo_maxsize SYSCTL_HANDLER_ARGS
655 {
656 #pragma unused(oidp, arg2, req)
657 int changed, error;
658 size_t value = *(size_t*) arg1;
659
660 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
661 if (error || !changed)
662 return (error);
663
664 if (value <= 0)
665 return (ERANGE);
666
667 lck_mtx_lock(&dtrace_lock);
668 dtrace_difo_maxsize = value;
669 lck_mtx_unlock(&dtrace_lock);
670
671 return (0);
672 }
673
674 /*
675 * kern.dtrace.difo_maxsize
676 *
677 * Set the DIFO max size in bytes; see the definition of dtrace_difo_maxsize
678 * for the default value. Attempting to set a zero or negative size will
679 * result in a failure.
680 */
681 SYSCTL_PROC(_kern_dtrace, OID_AUTO, difo_maxsize,
682 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
683 &dtrace_difo_maxsize, 0,
684 sysctl_dtrace_difo_maxsize, "Q", "dtrace difo maxsize");
685
686 static int
687 sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
688 {
689 #pragma unused(oidp, arg2, req)
690 int changed, error;
691 dtrace_optval_t value = *(dtrace_optval_t *) arg1;
692
693 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
694 if (error || !changed)
695 return (error);
696
697 if (value <= 0)
698 return (ERANGE);
699
700 lck_mtx_lock(&dtrace_lock);
701 dtrace_dof_maxsize = value;
702 lck_mtx_unlock(&dtrace_lock);
703
704 return (0);
705 }
706
707 /*
708 * kern.dtrace.dof_maxsize
709 *
710 * Set the DOF max size in bytes; see the definition of dtrace_dof_maxsize for
711 * the default value. Attempting to set a zero or negative size will result
712 * in a failure.
713 */
714 SYSCTL_PROC(_kern_dtrace, OID_AUTO, dof_maxsize,
715 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
716 &dtrace_dof_maxsize, 0,
717 sysctl_dtrace_dof_maxsize, "Q", "dtrace dof maxsize");
718
719 static int
720 sysctl_dtrace_global_maxsize SYSCTL_HANDLER_ARGS
721 {
722 #pragma unused(oidp, arg2, req)
723 int changed, error;
724 dtrace_optval_t value = *(dtrace_optval_t*) arg1;
725
726 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
727 if (error || !changed)
728 return (error);
729
730 if (value <= 0)
731 return (ERANGE);
732
733 lck_mtx_lock(&dtrace_lock);
734 dtrace_global_maxsize = value;
735 lck_mtx_unlock(&dtrace_lock);
736
737 return (0);
738 }
739
740 /*
741 * kern.dtrace.global_maxsize
742 *
743 * Set the global variable max size in bytes; see the definition of
744 * dtrace_global_maxsize for the default value. Attempting to set a zero or
745 * negative size will result in a failure.
746 */
747 SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
748 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
749 &dtrace_global_maxsize, 0,
750 sysctl_dtrace_global_maxsize, "Q", "dtrace global maxsize");
751
752 static int
753 sysctl_dtrace_provide_private_probes SYSCTL_HANDLER_ARGS
754 {
755 #pragma unused(oidp, arg2)
756 int error;
757 int value = *(int *) arg1;
758
759 error = sysctl_io_number(req, value, sizeof(value), &value, NULL);
760 if (error)
761 return (error);
762
763 if (value != 0 && value != 1)
764 return (ERANGE);
765
766 lck_mtx_lock(&dtrace_lock);
767 dtrace_provide_private_probes = value;
768 lck_mtx_unlock(&dtrace_lock);
769
770 return (0);
771 }
772
773 /*
774 * kern.dtrace.provide_private_probes
775 *
776 * Set whether the providers must provide the private probes. This is
777 * mainly used by the FBT provider to request probes for the private/static
778 * symbols.
779 */
780 SYSCTL_PROC(_kern_dtrace, OID_AUTO, provide_private_probes,
781 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
782 &dtrace_provide_private_probes, 0,
783 sysctl_dtrace_provide_private_probes, "I", "provider must provide the private probes");
784
785 /*
786 * DTrace Probe Context Functions
787 *
788 * These functions are called from probe context. Because probe context is
789 * any context in which C may be called: arbitrary locks may be held,
790 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
791 * As a result, functions called from probe context may only call other DTrace
792 * support functions -- they may not interact at all with the system at large.
793 * (Note that the ASSERT macro is made probe-context safe by redefining it in
794 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
795 * loads are to be performed from probe context, they _must_ be in terms of
796 * the safe dtrace_load*() variants.
797 *
798 * Some functions in this block are not actually called from probe context;
799 * for these functions, there will be a comment above the function reading
800 * "Note: not called from probe context."
801 */
802
803 int
804 dtrace_assfail(const char *a, const char *f, int l)
805 {
806 panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);
807
808 /*
809 * We just need something here that even the most clever compiler
810 * cannot optimize away.
811 */
812 return (a[(uintptr_t)f]);
813 }
814
815 /*
816 * Atomically increment a specified error counter from probe context.
817 */
818 static void
819 dtrace_error(uint32_t *counter)
820 {
821 /*
822 * Most counters stored to in probe context are per-CPU counters.
823 * However, there are some error conditions that are sufficiently
824 * arcane that they don't merit per-CPU storage. If these counters
825 * are incremented concurrently on different CPUs, scalability will be
826 * adversely affected -- but we don't expect them to be white-hot in a
827 * correctly constructed enabling...
828 */
829 uint32_t oval, nval;
830
831 do {
832 oval = *counter;
833
834 if ((nval = oval + 1) == 0) {
835 /*
836 * If the counter would wrap, set it to 1 -- assuring
837 * that the counter is never zero when we have seen
838 * errors. (The counter must be 32-bits because we
839 * aren't guaranteed a 64-bit compare&swap operation.)
840 * To save this code both the infamy of being fingered
841 * by a priggish news story and the indignity of being
842 * the target of a neo-puritan witch trial, we're
843 * carefully avoiding any colorful description of the
844 * likelihood of this condition -- but suffice it to
845 * say that it is only slightly more likely than the
846 * overflow of predicate cache IDs, as discussed in
847 * dtrace_predicate_create().
848 */
849 nval = 1;
850 }
851 } while (dtrace_cas32(counter, oval, nval) != oval);
852 }
853
854 /*
855 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
856 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
857 */
858 DTRACE_LOADFUNC(8)
859 DTRACE_LOADFUNC(16)
860 DTRACE_LOADFUNC(32)
861 DTRACE_LOADFUNC(64)
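
/*
 * Illustrative sketch (not part of the implementation): typical probe-context
 * use of the generated loaders. Raw dereferences are never safe in probe
 * context; the usual pattern is roughly:
 *
 *	uint64_t val = dtrace_load64(addr);
 *
 *	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT)) {
 *		... the load faulted or hit a toxic range; bail out ...
 *	}
 *
 * dtrace_loadptr maps to the pointer-sized variant, per the #define earlier
 * in this file.
 */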
862
863 static int
864 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
865 {
866 if (dest < mstate->dtms_scratch_base)
867 return (0);
868
869 if (dest + size < dest)
870 return (0);
871
872 if (dest + size > mstate->dtms_scratch_ptr)
873 return (0);
874
875 return (1);
876 }
877
878 static int
879 dtrace_canstore_statvar(uint64_t addr, size_t sz,
880 dtrace_statvar_t **svars, int nsvars)
881 {
882 int i;
883
884 for (i = 0; i < nsvars; i++) {
885 dtrace_statvar_t *svar = svars[i];
886
887 if (svar == NULL || svar->dtsv_size == 0)
888 continue;
889
890 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
891 return (1);
892 }
893
894 return (0);
895 }
896
897 /*
898 * Check to see if the address is within a memory region to which a store may
899 * be issued. This includes the DTrace scratch areas, and any DTrace variable
900 * region. The caller of dtrace_canstore() is responsible for performing any
901 * alignment checks that are needed before stores are actually executed.
902 */
903 static int
904 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
905 dtrace_vstate_t *vstate)
906 {
907 /*
908 * First, check to see if the address is in scratch space...
909 */
910 if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
911 mstate->dtms_scratch_size))
912 return (1);
913
914 /*
915 * Now check to see if it's a dynamic variable. This check will pick
916 * up both thread-local variables and any global dynamically-allocated
917 * variables.
918 */
919 if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
920 vstate->dtvs_dynvars.dtds_size)) {
921 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
922 uintptr_t base = (uintptr_t)dstate->dtds_base +
923 (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
924 uintptr_t chunkoffs;
925
926 /*
927 * Before we assume that we can store here, we need to make
928 * sure that it isn't in our metadata -- storing to our
929 * dynamic variable metadata would corrupt our state. For
930 * the range to not include any dynamic variable metadata,
931 * it must:
932 *
933 * (1) Start above the hash table that is at the base of
934 * the dynamic variable space
935 *
936 * (2) Have a starting chunk offset that is beyond the
937 * dtrace_dynvar_t that is at the base of every chunk
938 *
939 * (3) Not span a chunk boundary
940 *
941 */
942 if (addr < base)
943 return (0);
944
945 chunkoffs = (addr - base) % dstate->dtds_chunksize;
946
947 if (chunkoffs < sizeof (dtrace_dynvar_t))
948 return (0);
949
950 if (chunkoffs + sz > dstate->dtds_chunksize)
951 return (0);
952
953 return (1);
954 }
955
956 /*
957 * Finally, check the static local and global variables. These checks
958 * take the longest, so we perform them last.
959 */
960 if (dtrace_canstore_statvar(addr, sz,
961 vstate->dtvs_locals, vstate->dtvs_nlocals))
962 return (1);
963
964 if (dtrace_canstore_statvar(addr, sz,
965 vstate->dtvs_globals, vstate->dtvs_nglobals))
966 return (1);
967
968 return (0);
969 }
970
971
972 /*
973 * Convenience routine to check to see if the address is within a memory
974 * region in which a load may be issued given the user's privilege level;
975 * if not, it sets the appropriate error flags and loads 'addr' into the
976 * illegal value slot.
977 *
978 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
979 * appropriate memory access protection.
980 */
981 static int
982 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
983 dtrace_vstate_t *vstate)
984 {
985 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
986
987 /*
988 * If we hold the privilege to read from kernel memory, then
989 * everything is readable.
990 */
991 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
992 return (1);
993
994 /*
995 * You can obviously read that which you can store.
996 */
997 if (dtrace_canstore(addr, sz, mstate, vstate))
998 return (1);
999
1000 /*
1001 * We're allowed to read from our own string table.
1002 */
1003 if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
1004 mstate->dtms_difo->dtdo_strlen))
1005 return (1);
1006
1007 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
1008 *illval = addr;
1009 return (0);
1010 }
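
/*
 * Illustrative sketch (not part of the implementation): the typical way a
 * DIF_SUBR_* implementation uses this check before touching memory handed to
 * it by DIF, assuming the usual emulator locals (regs, rd):
 *
 *	if (!dtrace_canload(addr, size, mstate, vstate)) {
 *		regs[rd] = 0;
 *		break;
 *	}
 *
 * On failure, dtrace_canload() has already set CPU_DTRACE_KPRIV and recorded
 * the offending address, so the caller only needs to bail out.
 */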
1011
1012 /*
1013 * Convenience routine to check to see if a given string is within a memory
1014 * region in which a load may be issued given the user's privilege level;
1015 * this exists so that we don't need to issue unnecessary dtrace_strlen()
1016 * calls in the event that the user has all privileges.
1017 */
1018 static int
1019 dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
1020 dtrace_vstate_t *vstate)
1021 {
1022 size_t strsz;
1023
1024 /*
1025 * If we hold the privilege to read from kernel memory, then
1026 * everything is readable.
1027 */
1028 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
1029 return (1);
1030
1031 strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
1032 if (dtrace_canload(addr, strsz, mstate, vstate))
1033 return (1);
1034
1035 return (0);
1036 }
1037
1038 /*
1039 * Convenience routine to check to see if a given variable is within a memory
1040 * region in which a load may be issued given the user's privilege level.
1041 */
1042 static int
1043 dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
1044 dtrace_vstate_t *vstate)
1045 {
1046 size_t sz;
1047 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1048
1049 /*
1050 * If we hold the privilege to read from kernel memory, then
1051 * everything is readable.
1052 */
1053 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
1054 return (1);
1055
1056 if (type->dtdt_kind == DIF_TYPE_STRING)
1057 sz = dtrace_strlen(src,
1058 vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
1059 else
1060 sz = type->dtdt_size;
1061
1062 return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
1063 }
1064
1065 /*
1066 * Compare two strings using safe loads.
1067 */
1068 static int
1069 dtrace_strncmp(char *s1, char *s2, size_t limit)
1070 {
1071 uint8_t c1, c2;
1072 volatile uint16_t *flags;
1073
1074 if (s1 == s2 || limit == 0)
1075 return (0);
1076
1077 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1078
1079 do {
1080 if (s1 == NULL) {
1081 c1 = '\0';
1082 } else {
1083 c1 = dtrace_load8((uintptr_t)s1++);
1084 }
1085
1086 if (s2 == NULL) {
1087 c2 = '\0';
1088 } else {
1089 c2 = dtrace_load8((uintptr_t)s2++);
1090 }
1091
1092 if (c1 != c2)
1093 return (c1 - c2);
1094 } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1095
1096 return (0);
1097 }
1098
1099 /*
1100 * Compute strlen(s) for a string using safe memory accesses. The additional
1101 * len parameter is used to specify a maximum length to ensure completion.
1102 */
1103 static size_t
1104 dtrace_strlen(const char *s, size_t lim)
1105 {
1106 uint_t len;
1107
1108 for (len = 0; len != lim; len++) {
1109 if (dtrace_load8((uintptr_t)s++) == '\0')
1110 break;
1111 }
1112
1113 return (len);
1114 }
1115
1116 /*
1117 * Check if an address falls within a toxic region.
1118 */
1119 static int
1120 dtrace_istoxic(uintptr_t kaddr, size_t size)
1121 {
1122 uintptr_t taddr, tsize;
1123 int i;
1124
1125 for (i = 0; i < dtrace_toxranges; i++) {
1126 taddr = dtrace_toxrange[i].dtt_base;
1127 tsize = dtrace_toxrange[i].dtt_limit - taddr;
1128
1129 if (kaddr - taddr < tsize) {
1130 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1131 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1132 return (1);
1133 }
1134
1135 if (taddr - kaddr < size) {
1136 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1137 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1138 return (1);
1139 }
1140 }
1141
1142 return (0);
1143 }
1144
1145 /*
1146 * Copy src to dst using safe memory accesses. The src is assumed to be unsafe
1147 * memory specified by the DIF program. The dst is assumed to be safe memory
1148 * that we can store to directly because it is managed by DTrace. As with
1149 * standard bcopy, overlapping copies are handled properly.
1150 */
1151 static void
1152 dtrace_bcopy(const void *src, void *dst, size_t len)
1153 {
1154 if (len != 0) {
1155 uint8_t *s1 = dst;
1156 const uint8_t *s2 = src;
1157
1158 if (s1 <= s2) {
1159 do {
1160 *s1++ = dtrace_load8((uintptr_t)s2++);
1161 } while (--len != 0);
1162 } else {
1163 s2 += len;
1164 s1 += len;
1165
1166 do {
1167 *--s1 = dtrace_load8((uintptr_t)--s2);
1168 } while (--len != 0);
1169 }
1170 }
1171 }
1172
1173 /*
1174 * Copy src to dst using safe memory accesses, up to either the specified
1175 * length, or the point that a nul byte is encountered. The src is assumed to
1176 * be unsafe memory specified by the DIF program. The dst is assumed to be
1177 * safe memory that we can store to directly because it is managed by DTrace.
1178 * Unlike dtrace_bcopy(), overlapping regions are not handled.
1179 */
1180 static void
1181 dtrace_strcpy(const void *src, void *dst, size_t len)
1182 {
1183 if (len != 0) {
1184 uint8_t *s1 = dst, c;
1185 const uint8_t *s2 = src;
1186
1187 do {
1188 *s1++ = c = dtrace_load8((uintptr_t)s2++);
1189 } while (--len != 0 && c != '\0');
1190 }
1191 }
1192
1193 /*
1194 * Copy src to dst, deriving the size and type from the specified (BYREF)
1195 * variable type. The src is assumed to be unsafe memory specified by the DIF
1196 * program. The dst is assumed to be DTrace variable memory that is of the
1197 * specified type; we assume that we can store to directly.
1198 */
1199 static void
1200 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
1201 {
1202 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1203
1204 if (type->dtdt_kind == DIF_TYPE_STRING) {
1205 dtrace_strcpy(src, dst, type->dtdt_size);
1206 } else {
1207 dtrace_bcopy(src, dst, type->dtdt_size);
1208 }
1209 }
1210
1211 /*
1212 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
1213 * unsafe memory specified by the DIF program. The s2 data is assumed to be
1214 * safe memory that we can access directly because it is managed by DTrace.
1215 */
1216 static int
1217 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1218 {
1219 volatile uint16_t *flags;
1220
1221 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1222
1223 if (s1 == s2)
1224 return (0);
1225
1226 if (s1 == NULL || s2 == NULL)
1227 return (1);
1228
1229 if (s1 != s2 && len != 0) {
1230 const uint8_t *ps1 = s1;
1231 const uint8_t *ps2 = s2;
1232
1233 do {
1234 if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1235 return (1);
1236 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1237 }
1238 return (0);
1239 }
1240
1241 /*
1242 * Zero the specified region using a simple byte-by-byte loop. Note that this
1243 * is for safe DTrace-managed memory only.
1244 */
1245 static void
1246 dtrace_bzero(void *dst, size_t len)
1247 {
1248 uchar_t *cp;
1249
1250 for (cp = dst; len != 0; len--)
1251 *cp++ = 0;
1252 }
1253
1254 static void
1255 dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
1256 {
1257 uint64_t result[2];
1258
1259 result[0] = addend1[0] + addend2[0];
1260 result[1] = addend1[1] + addend2[1] +
1261 (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
1262
1263 sum[0] = result[0];
1264 sum[1] = result[1];
1265 }
1266
1267 /*
1268 * Shift the 128-bit value in a by b. If b is positive, shift left.
1269 * If b is negative, shift right.
1270 */
1271 static void
1272 dtrace_shift_128(uint64_t *a, int b)
1273 {
1274 uint64_t mask;
1275
1276 if (b == 0)
1277 return;
1278
1279 if (b < 0) {
1280 b = -b;
1281 if (b >= 64) {
1282 a[0] = a[1] >> (b - 64);
1283 a[1] = 0;
1284 } else {
1285 a[0] >>= b;
1286 mask = 1LL << (64 - b);
1287 mask -= 1;
1288 a[0] |= ((a[1] & mask) << (64 - b));
1289 a[1] >>= b;
1290 }
1291 } else {
1292 if (b >= 64) {
1293 a[1] = a[0] << (b - 64);
1294 a[0] = 0;
1295 } else {
1296 a[1] <<= b;
1297 mask = a[0] >> (64 - b);
1298 a[1] |= mask;
1299 a[0] <<= b;
1300 }
1301 }
1302 }
1303
1304 /*
1305 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1306 * use native multiplication on those, and then re-combine into the
1307 * resulting 128-bit value.
1308 *
1309 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1310 * hi1 * hi2 << 64 +
1311 * hi1 * lo2 << 32 +
1312 * hi2 * lo1 << 32 +
1313 * lo1 * lo2
1314 */
1315 static void
1316 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1317 {
1318 uint64_t hi1, hi2, lo1, lo2;
1319 uint64_t tmp[2];
1320
1321 hi1 = factor1 >> 32;
1322 hi2 = factor2 >> 32;
1323
1324 lo1 = factor1 & DT_MASK_LO;
1325 lo2 = factor2 & DT_MASK_LO;
1326
1327 product[0] = lo1 * lo2;
1328 product[1] = hi1 * hi2;
1329
1330 tmp[0] = hi1 * lo2;
1331 tmp[1] = 0;
1332 dtrace_shift_128(tmp, 32);
1333 dtrace_add_128(product, tmp, product);
1334
1335 tmp[0] = hi2 * lo1;
1336 tmp[1] = 0;
1337 dtrace_shift_128(tmp, 32);
1338 dtrace_add_128(product, tmp, product);
1339 }
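
/*
 * Illustrative sketch (not part of the implementation): the decomposition in
 * the comment above can be sanity-checked in a standalone user-space program
 * against a compiler-provided 128-bit type (assumes __uint128_t support, as
 * in clang/gcc on x86_64):
 *
 *	#include <stdint.h>
 *	#include <assert.h>
 *
 *	int
 *	main(void)
 *	{
 *		uint64_t f1 = 0x123456789abcdef0ULL, f2 = 0x0fedcba987654321ULL;
 *		uint64_t hi1 = f1 >> 32, lo1 = f1 & 0xffffffffULL;
 *		uint64_t hi2 = f2 >> 32, lo2 = f2 & 0xffffffffULL;
 *
 *		__uint128_t expect = (__uint128_t)f1 * f2;
 *		__uint128_t got =
 *		    ((__uint128_t)(hi1 * hi2) << 64) +
 *		    ((__uint128_t)(hi1 * lo2) << 32) +
 *		    ((__uint128_t)(hi2 * lo1) << 32) +
 *		     (__uint128_t)(lo1 * lo2);
 *
 *		assert(got == expect);
 *		return (0);
 *	}
 */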
1340
1341 /*
1342 * This privilege check should be used by actions and subroutines to
1343 * verify that the user credentials of the process that enabled the
1344 * invoking ECB match the target credentials
1345 */
1346 static int
1347 dtrace_priv_proc_common_user(dtrace_state_t *state)
1348 {
1349 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1350
1351 /*
1352 * We should always have a non-NULL state cred here, since if cred
1353 * is null (anonymous tracing), we fast-path bypass this routine.
1354 */
1355 ASSERT(s_cr != NULL);
1356
1357 if ((cr = dtrace_CRED()) != NULL &&
1358 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
1359 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
1360 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
1361 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
1362 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
1363 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
1364 return (1);
1365
1366 return (0);
1367 }
1368
1369 /*
1370 * This privilege check should be used by actions and subroutines to
1371 * verify that the zone of the process that enabled the invoking ECB
1372 * matches the target credentials
1373 */
1374 static int
1375 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1376 {
1377 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1378 #pragma unused(cr, s_cr, state) /* __APPLE__ */
1379
1380 /*
1381 * We should always have a non-NULL state cred here, since if cred
1382 * is null (anonymous tracing), we fast-path bypass this routine.
1383 */
1384 ASSERT(s_cr != NULL);
1385
1386 return 1; /* APPLE NOTE: Darwin doesn't do zones. */
1387 }
1388
1389 /*
1390 * This privilege check should be used by actions and subroutines to
1391 * verify that the process has not setuid or changed credentials.
1392 */
1393 static int
1394 dtrace_priv_proc_common_nocd(void)
1395 {
1396 return 1; /* Darwin omits "No Core Dump" flag. */
1397 }
1398
1399 static int
1400 dtrace_priv_proc_destructive(dtrace_state_t *state)
1401 {
1402 int action = state->dts_cred.dcr_action;
1403
1404 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1405 goto bad;
1406
1407 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1408 goto bad;
1409
1410 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1411 dtrace_priv_proc_common_zone(state) == 0)
1412 goto bad;
1413
1414 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1415 dtrace_priv_proc_common_user(state) == 0)
1416 goto bad;
1417
1418 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1419 dtrace_priv_proc_common_nocd() == 0)
1420 goto bad;
1421
1422 return (1);
1423
1424 bad:
1425 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1426
1427 return (0);
1428 }
1429
1430 static int
1431 dtrace_priv_proc_control(dtrace_state_t *state)
1432 {
1433 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1434 goto bad;
1435
1436 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1437 goto bad;
1438
1439 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1440 return (1);
1441
1442 if (dtrace_priv_proc_common_zone(state) &&
1443 dtrace_priv_proc_common_user(state) &&
1444 dtrace_priv_proc_common_nocd())
1445 return (1);
1446
1447 bad:
1448 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1449
1450 return (0);
1451 }
1452
1453 static int
1454 dtrace_priv_proc(dtrace_state_t *state)
1455 {
1456 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1457 goto bad;
1458
1459 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1460 goto bad;
1461
1462 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1463 return (1);
1464
1465 bad:
1466 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1467
1468 return (0);
1469 }
1470
1471 /*
1472 * The P_LNOATTACH check is an Apple specific check.
1473 * We need a version of dtrace_priv_proc() that omits
1474 * that check for PID and EXECNAME accesses
1475 */
1476 static int
1477 dtrace_priv_proc_relaxed(dtrace_state_t *state)
1478 {
1479
1480 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1481 return (1);
1482
1483 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1484
1485 return (0);
1486 }
1487
1488 static int
1489 dtrace_priv_kernel(dtrace_state_t *state)
1490 {
1491 if (dtrace_is_restricted())
1492 goto bad;
1493
1494 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1495 return (1);
1496
1497 bad:
1498 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1499
1500 return (0);
1501 }
1502
1503 static int
1504 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1505 {
1506 if (dtrace_is_restricted())
1507 goto bad;
1508
1509 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1510 return (1);
1511
1512 bad:
1513 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1514
1515 return (0);
1516 }
1517
1518 /*
1519 * Note: not called from probe context. This function is called
1520 * asynchronously (and at a regular interval) from outside of probe context to
1521 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
1522 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1523 */
1524 static void
1525 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1526 {
1527 dtrace_dynvar_t *dirty;
1528 dtrace_dstate_percpu_t *dcpu;
1529 int i, work = 0;
1530
1531 for (i = 0; i < (int)NCPU; i++) {
1532 dcpu = &dstate->dtds_percpu[i];
1533
1534 ASSERT(dcpu->dtdsc_rinsing == NULL);
1535
1536 /*
1537 * If the dirty list is NULL, there is no dirty work to do.
1538 */
1539 if (dcpu->dtdsc_dirty == NULL)
1540 continue;
1541
1542 /*
1543 * If the clean list is non-NULL, then we're not going to do
1544 * any work for this CPU -- it means that there has not been
1545 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1546 * since the last time we cleaned house.
1547 */
1548 if (dcpu->dtdsc_clean != NULL)
1549 continue;
1550
1551 work = 1;
1552
1553 /*
1554 * Atomically move the dirty list aside.
1555 */
1556 do {
1557 dirty = dcpu->dtdsc_dirty;
1558
1559 /*
1560 * Before we zap the dirty list, set the rinsing list.
1561 * (This allows for a potential assertion in
1562 * dtrace_dynvar(): if a free dynamic variable appears
1563 * on a hash chain, either the dirty list or the
1564 * rinsing list for some CPU must be non-NULL.)
1565 */
1566 dcpu->dtdsc_rinsing = dirty;
1567 dtrace_membar_producer();
1568 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1569 dirty, NULL) != dirty);
1570 }
1571
1572 if (!work) {
1573 /*
1574 * We have no work to do; we can simply return.
1575 */
1576 return;
1577 }
1578
1579 dtrace_sync();
1580
1581 for (i = 0; i < (int)NCPU; i++) {
1582 dcpu = &dstate->dtds_percpu[i];
1583
1584 if (dcpu->dtdsc_rinsing == NULL)
1585 continue;
1586
1587 /*
1588 * We are now guaranteed that no hash chain contains a pointer
1589 * into this dirty list; we can make it clean.
1590 */
1591 ASSERT(dcpu->dtdsc_clean == NULL);
1592 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1593 dcpu->dtdsc_rinsing = NULL;
1594 }
1595
1596 /*
1597 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1598 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1599 * This prevents a race whereby a CPU incorrectly decides that
1600 * the state should be something other than DTRACE_DSTATE_CLEAN
1601 * after dtrace_dynvar_clean() has completed.
1602 */
1603 dtrace_sync();
1604
1605 dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1606 }
1607
1608 /*
1609 * Depending on the value of the op parameter, this function looks up,
1610 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
1611 * allocation is requested, this function will return a pointer to a
1612 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1613 * variable can be allocated. If NULL is returned, the appropriate counter
1614 * will be incremented.
1615 */
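
/*
 * Illustrative sketch (not part of the implementation): a typical allocation
 * call, roughly as made by the DIF emulation code when storing to a dynamic
 * variable:
 *
 *	dvar = dtrace_dynvar(dstate, nkeys, key, dsize,
 *	    DTRACE_DYNVAR_ALLOC, mstate, vstate);
 *
 *	if (dvar == NULL) {
 *		... no variable could be allocated; the appropriate counter
 *		    has already been incremented, per the comment above ...
 *	}
 */
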
1616 static dtrace_dynvar_t *
1617 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1618 dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1619 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1620 {
1621 uint64_t hashval = DTRACE_DYNHASH_VALID;
1622 dtrace_dynhash_t *hash = dstate->dtds_hash;
1623 dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1624 processorid_t me = CPU->cpu_id, cpu = me;
1625 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1626 size_t bucket, ksize;
1627 size_t chunksize = dstate->dtds_chunksize;
1628 uintptr_t kdata, lock, nstate;
1629 uint_t i;
1630
1631 ASSERT(nkeys != 0);
1632
1633 /*
1634 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
1635 * algorithm. For the by-value portions, we perform the algorithm in
1636 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
1637 * bit, and seems to have only a minute effect on distribution. For
1638 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1639 * over each referenced byte. It's painful to do this, but it's much
1640 * better than pathological hash distribution. The efficacy of the
1641 * hashing algorithm (and a comparison with other algorithms) may be
1642 * found by running the ::dtrace_dynstat MDB dcmd.
1643 */
1644 for (i = 0; i < nkeys; i++) {
1645 if (key[i].dttk_size == 0) {
1646 uint64_t val = key[i].dttk_value;
1647
1648 hashval += (val >> 48) & 0xffff;
1649 hashval += (hashval << 10);
1650 hashval ^= (hashval >> 6);
1651
1652 hashval += (val >> 32) & 0xffff;
1653 hashval += (hashval << 10);
1654 hashval ^= (hashval >> 6);
1655
1656 hashval += (val >> 16) & 0xffff;
1657 hashval += (hashval << 10);
1658 hashval ^= (hashval >> 6);
1659
1660 hashval += val & 0xffff;
1661 hashval += (hashval << 10);
1662 hashval ^= (hashval >> 6);
1663 } else {
1664 /*
1665 * This is incredibly painful, but it beats the hell
1666 * out of the alternative.
1667 */
1668 uint64_t j, size = key[i].dttk_size;
1669 uintptr_t base = (uintptr_t)key[i].dttk_value;
1670
1671 if (!dtrace_canload(base, size, mstate, vstate))
1672 break;
1673
1674 for (j = 0; j < size; j++) {
1675 hashval += dtrace_load8(base + j);
1676 hashval += (hashval << 10);
1677 hashval ^= (hashval >> 6);
1678 }
1679 }
1680 }
1681
1682 if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1683 return (NULL);
1684
1685 hashval += (hashval << 3);
1686 hashval ^= (hashval >> 11);
1687 hashval += (hashval << 15);
1688
1689 /*
1690 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1691 * comes out to be one of our two sentinel hash values. If this
1692 * actually happens, we set the hashval to be a value known to be a
1693 * non-sentinel value.
1694 */
1695 if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1696 hashval = DTRACE_DYNHASH_VALID;
1697
1698 /*
1699 * Yes, it's painful to do a divide here. If the cycle count becomes
1700 * important here, tricks can be pulled to reduce it. (However, it's
1701 * critical that hash collisions be kept to an absolute minimum;
1702 * they're much more painful than a divide.) It's better to have a
1703 * solution that generates few collisions and still keeps things
1704 * relatively simple.
1705 */
1706 bucket = hashval % dstate->dtds_hashsize;
1707
1708 if (op == DTRACE_DYNVAR_DEALLOC) {
1709 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1710
1711 for (;;) {
1712 while ((lock = *lockp) & 1)
1713 continue;
1714
1715 if (dtrace_casptr((void *)(uintptr_t)lockp,
1716 (void *)lock, (void *)(lock + 1)) == (void *)lock)
1717 break;
1718 }
1719
1720 dtrace_membar_producer();
1721 }
1722
1723 top:
1724 prev = NULL;
1725 lock = hash[bucket].dtdh_lock;
1726
1727 dtrace_membar_consumer();
1728
1729 start = hash[bucket].dtdh_chain;
1730 ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1731 start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1732 op != DTRACE_DYNVAR_DEALLOC));
1733
1734 for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1735 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1736 dtrace_key_t *dkey = &dtuple->dtt_key[0];
1737
1738 if (dvar->dtdv_hashval != hashval) {
1739 if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1740 /*
1741 * We've reached the sink, and therefore the
1742 * end of the hash chain; we can kick out of
1743 * the loop knowing that we have seen a valid
1744 * snapshot of state.
1745 */
1746 ASSERT(dvar->dtdv_next == NULL);
1747 ASSERT(dvar == &dtrace_dynhash_sink);
1748 break;
1749 }
1750
1751 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1752 /*
1753 * We've gone off the rails: somewhere along
1754 * the line, one of the members of this hash
1755 * chain was deleted. Note that we could also
1756 * detect this by simply letting this loop run
1757 * to completion, as we would eventually hit
1758 * the end of the dirty list. However, we
1759 * want to avoid running the length of the
1760 * dirty list unnecessarily (it might be quite
1761 * long), so we catch this as early as
1762 * possible by detecting the hash marker. In
1763 * this case, we simply set dvar to NULL and
1764 * break; the conditional after the loop will
1765 * send us back to top.
1766 */
1767 dvar = NULL;
1768 break;
1769 }
1770
1771 goto next;
1772 }
1773
1774 if (dtuple->dtt_nkeys != nkeys)
1775 goto next;
1776
1777 for (i = 0; i < nkeys; i++, dkey++) {
1778 if (dkey->dttk_size != key[i].dttk_size)
1779 goto next; /* size or type mismatch */
1780
1781 if (dkey->dttk_size != 0) {
1782 if (dtrace_bcmp(
1783 (void *)(uintptr_t)key[i].dttk_value,
1784 (void *)(uintptr_t)dkey->dttk_value,
1785 dkey->dttk_size))
1786 goto next;
1787 } else {
1788 if (dkey->dttk_value != key[i].dttk_value)
1789 goto next;
1790 }
1791 }
1792
1793 if (op != DTRACE_DYNVAR_DEALLOC)
1794 return (dvar);
1795
1796 ASSERT(dvar->dtdv_next == NULL ||
1797 dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1798
1799 if (prev != NULL) {
1800 ASSERT(hash[bucket].dtdh_chain != dvar);
1801 ASSERT(start != dvar);
1802 ASSERT(prev->dtdv_next == dvar);
1803 prev->dtdv_next = dvar->dtdv_next;
1804 } else {
1805 if (dtrace_casptr(&hash[bucket].dtdh_chain,
1806 start, dvar->dtdv_next) != start) {
1807 /*
1808 * We have failed to atomically swing the
1809 * hash table head pointer, presumably because
1810 * of a conflicting allocation on another CPU.
1811 * We need to reread the hash chain and try
1812 * again.
1813 */
1814 goto top;
1815 }
1816 }
1817
1818 dtrace_membar_producer();
1819
1820 /*
1821 * Now set the hash value to indicate that it's free.
1822 */
1823 ASSERT(hash[bucket].dtdh_chain != dvar);
1824 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1825
1826 dtrace_membar_producer();
1827
1828 /*
1829 * Set the next pointer to point at the dirty list, and
1830 * atomically swing the dirty pointer to the newly freed dvar.
1831 */
1832 do {
1833 next = dcpu->dtdsc_dirty;
1834 dvar->dtdv_next = next;
1835 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1836
1837 /*
1838 * Finally, unlock this hash bucket.
1839 */
1840 ASSERT(hash[bucket].dtdh_lock == lock);
1841 ASSERT(lock & 1);
1842 hash[bucket].dtdh_lock++;
1843
1844 return (NULL);
1845 next:
1846 prev = dvar;
1847 continue;
1848 }
1849
1850 if (dvar == NULL) {
1851 /*
1852 * If dvar is NULL, it is because we went off the rails:
1853 * one of the elements that we traversed in the hash chain
1854 * was deleted while we were traversing it. In this case,
1855 * we assert that we aren't doing a dealloc (deallocs lock
1856 * the hash bucket to prevent themselves from racing with
1857 * one another), and retry the hash chain traversal.
1858 */
1859 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1860 goto top;
1861 }
1862
1863 if (op != DTRACE_DYNVAR_ALLOC) {
1864 /*
1865 * If we are not to allocate a new variable, we want to
1866 * return NULL now. Before we return, check that the value
1867 * of the lock word hasn't changed. If it has, we may have
1868 * seen an inconsistent snapshot.
1869 */
1870 if (op == DTRACE_DYNVAR_NOALLOC) {
1871 if (hash[bucket].dtdh_lock != lock)
1872 goto top;
1873 } else {
1874 ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1875 ASSERT(hash[bucket].dtdh_lock == lock);
1876 ASSERT(lock & 1);
1877 hash[bucket].dtdh_lock++;
1878 }
1879
1880 return (NULL);
1881 }
1882
1883 /*
1884 * We need to allocate a new dynamic variable. The size we need is the
1885 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1886 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1887 * the size of any referred-to data (dsize). We then round the final
1888 * size up to the chunksize for allocation.
1889 */
1890 for (ksize = 0, i = 0; i < nkeys; i++)
1891 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1892
1893 /*
1894 * This should be pretty much impossible, but could happen if, say,
1895 * strange DIF specified the tuple. Ideally, this should be an
1896 * assertion and not an error condition -- but that requires that the
1897 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1898 * bullet-proof. (That is, it must not be able to be fooled by
1899 * malicious DIF.) Given the lack of backwards branches in DIF,
1900 * solving this would presumably not amount to solving the Halting
1901 * Problem -- but it still seems awfully hard.
1902 */
1903 if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1904 ksize + dsize > chunksize) {
1905 dcpu->dtdsc_drops++;
1906 return (NULL);
1907 }
1908
1909 nstate = DTRACE_DSTATE_EMPTY;
1910
1911 do {
1912 retry:
1913 free = dcpu->dtdsc_free;
1914
1915 if (free == NULL) {
1916 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1917 void *rval;
1918
1919 if (clean == NULL) {
1920 /*
1921 * We're out of dynamic variable space on
1922 * this CPU. Unless we have tried all CPUs,
1923 * we'll try to allocate from a different
1924 * CPU.
1925 */
1926 switch (dstate->dtds_state) {
1927 case DTRACE_DSTATE_CLEAN: {
1928 void *sp = &dstate->dtds_state;
1929
1930 if (++cpu >= (int)NCPU)
1931 cpu = 0;
1932
1933 if (dcpu->dtdsc_dirty != NULL &&
1934 nstate == DTRACE_DSTATE_EMPTY)
1935 nstate = DTRACE_DSTATE_DIRTY;
1936
1937 if (dcpu->dtdsc_rinsing != NULL)
1938 nstate = DTRACE_DSTATE_RINSING;
1939
1940 dcpu = &dstate->dtds_percpu[cpu];
1941
1942 if (cpu != me)
1943 goto retry;
1944
1945 (void) dtrace_cas32(sp,
1946 DTRACE_DSTATE_CLEAN, nstate);
1947
1948 /*
1949 * To increment the correct bean
1950 * counter, take another lap.
1951 */
1952 goto retry;
1953 }
1954
1955 case DTRACE_DSTATE_DIRTY:
1956 dcpu->dtdsc_dirty_drops++;
1957 break;
1958
1959 case DTRACE_DSTATE_RINSING:
1960 dcpu->dtdsc_rinsing_drops++;
1961 break;
1962
1963 case DTRACE_DSTATE_EMPTY:
1964 dcpu->dtdsc_drops++;
1965 break;
1966 }
1967
1968 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1969 return (NULL);
1970 }
1971
1972 /*
1973 * The clean list appears to be non-empty. We want to
1974 * move the clean list to the free list; we start by
1975 * moving the clean pointer aside.
1976 */
1977 if (dtrace_casptr(&dcpu->dtdsc_clean,
1978 clean, NULL) != clean) {
1979 /*
1980 * We are in one of two situations:
1981 *
1982 * (a) The clean list was switched to the
1983 * free list by another CPU.
1984 *
1985 * (b) The clean list was added to by the
1986 * cleansing cyclic.
1987 *
1988 * In either of these situations, we can
1989 * just reattempt the free list allocation.
1990 */
1991 goto retry;
1992 }
1993
1994 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1995
1996 /*
1997 * Now we'll move the clean list to the free list.
1998 * It's impossible for this to fail: the only way
1999 * the free list can be updated is through this
2000 * code path, and only one CPU can own the clean list.
2001 * Thus, it would only be possible for this to fail if
2002 * this code were racing with dtrace_dynvar_clean().
2003 * (That is, if dtrace_dynvar_clean() updated the clean
2004 * list, and we ended up racing to update the free
2005 * list.) This race is prevented by the dtrace_sync()
2006 * in dtrace_dynvar_clean() -- which flushes the
2007 * owners of the clean lists out before resetting
2008 * the clean lists.
2009 */
2010 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2011 ASSERT(rval == NULL);
2012 goto retry;
2013 }
2014
2015 dvar = free;
2016 new_free = dvar->dtdv_next;
2017 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2018
2019 /*
2020 * We have now allocated a new chunk. We copy the tuple keys into the
2021 * tuple array and copy any referenced key data into the data space
2022 * following the tuple array. As we do this, we relocate dttk_value
2023 * in the final tuple to point to the key data address in the chunk.
2024 */
2025 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2026 dvar->dtdv_data = (void *)(kdata + ksize);
2027 dvar->dtdv_tuple.dtt_nkeys = nkeys;
2028
2029 for (i = 0; i < nkeys; i++) {
2030 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2031 size_t kesize = key[i].dttk_size;
2032
2033 if (kesize != 0) {
2034 dtrace_bcopy(
2035 (const void *)(uintptr_t)key[i].dttk_value,
2036 (void *)kdata, kesize);
2037 dkey->dttk_value = kdata;
2038 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2039 } else {
2040 dkey->dttk_value = key[i].dttk_value;
2041 }
2042
2043 dkey->dttk_size = kesize;
2044 }
2045
2046 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2047 dvar->dtdv_hashval = hashval;
2048 dvar->dtdv_next = start;
2049
2050 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2051 return (dvar);
2052
2053 /*
2054 * The cas has failed. Either another CPU is adding an element to
2055 * this hash chain, or another CPU is deleting an element from this
2056 * hash chain. The simplest way to deal with both of these cases
2057 * (though not necessarily the most efficient) is to free our
2058 * allocated block and tail-call ourselves. Note that the free is
2059 * to the dirty list and _not_ to the free list. This is to prevent
2060 * races with allocators, above.
2061 */
2062 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2063
2064 dtrace_membar_producer();
2065
2066 do {
2067 free = dcpu->dtdsc_dirty;
2068 dvar->dtdv_next = free;
2069 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2070
2071 return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2072 }
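
/*
 * Editorial note (not part of the original source): the hashing performed in
 * dtrace_dynvar() above -- and again in dtrace_aggregate() below -- is Bob
 * Jenkins' "One-at-a-time" hash, with by-value keys consumed in 16-bit chunks
 * rather than single bytes. A minimal, self-contained sketch of the canonical
 * byte-at-a-time form, including the same finalization steps, is shown here
 * for reference only; the function name is illustrative and appears nowhere
 * else in this file.
 */
#if 0 /* illustrative sketch only */
static uint64_t
example_jenkins_one_at_a_time(const uint8_t *data, size_t len)
{
uint64_t hashval = 0;
size_t i;

/* Mix each byte into the running hash value. */
for (i = 0; i < len; i++) {
hashval += data[i];
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
}

/* Final avalanche, identical to the steps used above. */
hashval += (hashval << 3);
hashval ^= (hashval >> 11);
hashval += (hashval << 15);

return (hashval);
}
#endif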
2073
2074 /*ARGSUSED*/
2075 static void
2076 dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2077 {
2078 #pragma unused(arg) /* __APPLE__ */
2079 if ((int64_t)nval < (int64_t)*oval)
2080 *oval = nval;
2081 }
2082
2083 /*ARGSUSED*/
2084 static void
2085 dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2086 {
2087 #pragma unused(arg) /* __APPLE__ */
2088 if ((int64_t)nval > (int64_t)*oval)
2089 *oval = nval;
2090 }
2091
2092 static void
2093 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2094 {
2095 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2096 int64_t val = (int64_t)nval;
2097
2098 if (val < 0) {
2099 for (i = 0; i < zero; i++) {
2100 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2101 quanta[i] += incr;
2102 return;
2103 }
2104 }
2105 } else {
2106 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2107 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2108 quanta[i - 1] += incr;
2109 return;
2110 }
2111 }
2112
2113 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2114 return;
2115 }
2116
2117 ASSERT(0);
2118 }
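
/*
 * Editorial note (not part of the original source): as a worked example of
 * the power-of-two bucketing above -- assuming the usual layout in which
 * DTRACE_QUANTIZE_BUCKETVAL() yields ..., -2, -1, 0, 1, 2, 4, 8, ... -- a
 * non-negative value is counted in the bucket labelled with the largest
 * power of two that does not exceed it: nval = 7 increments the bucket
 * labelled 4 (covering [4, 8)), nval = 8 increments the bucket labelled 8,
 * and nval = 0 increments the zero bucket.
 */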
2119
2120 static void
2121 dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2122 {
2123 uint64_t arg = *lquanta++;
2124 int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2125 uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2126 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2127 int32_t val = (int32_t)nval, level;
2128
2129 ASSERT(step != 0);
2130 ASSERT(levels != 0);
2131
2132 if (val < base) {
2133 /*
2134 * This is an underflow.
2135 */
2136 lquanta[0] += incr;
2137 return;
2138 }
2139
2140 level = (val - base) / step;
2141
2142 if (level < levels) {
2143 lquanta[level + 1] += incr;
2144 return;
2145 }
2146
2147 /*
2148 * This is an overflow.
2149 */
2150 lquanta[levels + 1] += incr;
2151 }
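
/*
 * Editorial note (not part of the original source): the lquanta[] layout
 * implied above is: slot 0 counts underflows (val < base), slots 1..levels
 * count values whose level (val - base) / step is 0..levels - 1, and slot
 * levels + 1 counts overflows. For example, with base = 0, step = 10 and
 * levels = 6, a value of 37 yields level 3 and therefore increments
 * lquanta[4], the slot covering values in [30, 40).
 */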
2152
2153 static int
2154 dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
2155 int16_t nsteps, int64_t value)
2156 {
2157 int64_t this = 1, last, next;
2158 int base = 1, order;
2159
2160 for (order = 0; order < low; ++order)
2161 this *= factor;
2162
2163 /*
2164 * If our value is less than our factor taken to the power of the
2165 * low order of magnitude, it goes into the zeroth bucket.
2166 */
2167 if (value < this)
2168 return 0;
2169 else
2170 last = this;
2171
2172 for (this *= factor; order <= high; ++order) {
2173 int nbuckets = this > nsteps ? nsteps : this;
2174
2175 /*
2176 * We should not generally get log/linear quantizations
2177 * with a high magnitude that allows 64-bits to
2178 * overflow, but we nonetheless protect against this
2179 * by explicitly checking for overflow, and clamping
2180 * our value accordingly.
2181 */
2182 next = this * factor;
2183 if (next < this) {
2184 value = this - 1;
2185 }
2186
2187 /*
2188 * If our value lies within this order of magnitude,
2189 * determine its position by taking the offset within
2190 * the order of magnitude, dividing by the bucket
2191 * width, and adding to our (accumulated) base.
2192 */
2193 if (value < this) {
2194 return (base + (value - last) / (this / nbuckets));
2195 }
2196
2197 base += nbuckets - (nbuckets / factor);
2198 last = this;
2199 this = next;
2200 }
2201
2202 /*
2203 * Our value is greater than or equal to our factor taken to the
2204 * power of one plus the high magnitude -- return the top bucket.
2205 */
2206 return base;
2207 }
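
/*
 * Editorial note (not part of the original source): a sample trace of the
 * bucketing above, assuming the purely illustrative parameters factor = 10,
 * low = 0, high = 2 and nsteps = 10: values below 1 land in bucket 0;
 * values 1-9 land in buckets 1-9 (width 1); values 10-99 land in buckets
 * 10-18 (width 10, e.g. 42 maps to bucket 13); values 100-999 land in
 * buckets 19-27 (width 100); and values of 1000 or more land in the
 * overflow bucket, 28. In general the layout comprises
 * (high - low + 1) * (nsteps - nsteps / factor) + 2 buckets.
 */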
2208
2209 static void
2210 dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2211 {
2212 uint64_t arg = *llquanta++;
2213 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2214 uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2215 uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2216 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2217
2218 llquanta[dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, nval)] += incr;
2219 }
2220
2221 /*ARGSUSED*/
2222 static void
2223 dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2224 {
2225 #pragma unused(arg) /* __APPLE__ */
2226 data[0]++;
2227 data[1] += nval;
2228 }
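
/*
 * Editorial note (not part of the original source): for avg(), data[0]
 * accumulates the sample count and data[1] the running sum; the consumer
 * derives the average as data[1] / data[0] when the aggregation is reported.
 */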
2229
2230 /*ARGSUSED*/
2231 static void
2232 dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2233 {
2234 #pragma unused(arg) /* __APPLE__ */
2235 int64_t snval = (int64_t)nval;
2236 uint64_t tmp[2];
2237
2238 data[0]++;
2239 data[1] += nval;
2240
2241 /*
2242 * What we want to say here is:
2243 *
2244 * data[2] += nval * nval;
2245 *
2246 * But given that nval is 64-bit, we could easily overflow, so
2247 * we do this as 128-bit arithmetic.
2248 */
2249 if (snval < 0)
2250 snval = -snval;
2251
2252 dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2253 dtrace_add_128(data + 2, tmp, data + 2);
2254 }
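
/*
 * Editorial note (not part of the original source): for stddev(), data[0]
 * holds the sample count n, data[1] the sum of the samples and data[2..3]
 * the 128-bit sum of their squares. The consumer can then recover the
 * (population) standard deviation as
 *
 * sqrt((n * sum(x^2) - (sum(x))^2) / n^2)
 *
 * so that no floating-point arithmetic is ever needed in probe context.
 */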
2255
2256 /*ARGSUSED*/
2257 static void
2258 dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2259 {
2260 #pragma unused(nval, arg) /* __APPLE__ */
2261 *oval = *oval + 1;
2262 }
2263
2264 /*ARGSUSED*/
2265 static void
2266 dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2267 {
2268 #pragma unused(arg) /* __APPLE__ */
2269 *oval += nval;
2270 }
2271
2272 /*
2273 * Aggregate given the tuple in the principal data buffer, and the aggregating
2274 * action denoted by the specified dtrace_aggregation_t. The aggregation
2275 * buffer is specified as the buf parameter. This routine does not return
2276 * failure; if there is no space in the aggregation buffer, the data will be
2277 * dropped, and a corresponding counter incremented.
2278 */
2279 static void
2280 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2281 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2282 {
2283 #pragma unused(arg)
2284 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2285 uint32_t i, ndx, size, fsize;
2286 uint32_t align = sizeof (uint64_t) - 1;
2287 dtrace_aggbuffer_t *agb;
2288 dtrace_aggkey_t *key;
2289 uint32_t hashval = 0, limit, isstr;
2290 caddr_t tomax, data, kdata;
2291 dtrace_actkind_t action;
2292 dtrace_action_t *act;
2293 uintptr_t offs;
2294
2295 if (buf == NULL)
2296 return;
2297
2298 if (!agg->dtag_hasarg) {
2299 /*
2300 * Currently, only quantize(), lquantize() and llquantize() take additional
2301 * arguments, and they have the same semantics: an increment
2302 * value that defaults to 1 when not present. If additional
2303 * aggregating actions take arguments, the setting of the
2304 * default argument value will presumably have to become more
2305 * sophisticated...
2306 */
2307 arg = 1;
2308 }
2309
2310 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2311 size = rec->dtrd_offset - agg->dtag_base;
2312 fsize = size + rec->dtrd_size;
2313
2314 ASSERT(dbuf->dtb_tomax != NULL);
2315 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2316
2317 if ((tomax = buf->dtb_tomax) == NULL) {
2318 dtrace_buffer_drop(buf);
2319 return;
2320 }
2321
2322 /*
2323 * The metastructure is always at the bottom of the buffer.
2324 */
2325 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2326 sizeof (dtrace_aggbuffer_t));
2327
2328 if (buf->dtb_offset == 0) {
2329 /*
2330 * We just kludge up approximately 1/8th of the size to be
2331 * buckets. If this guess ends up being routinely
2332 * off-the-mark, we may need to dynamically readjust this
2333 * based on past performance.
2334 */
2335 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2336
2337 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2338 (uintptr_t)tomax || hashsize == 0) {
2339 /*
2340 * We've been given a ludicrously small buffer;
2341 * increment our drop count and leave.
2342 */
2343 dtrace_buffer_drop(buf);
2344 return;
2345 }
2346
2347 /*
2348 * And now, a pathetic attempt to try to get an odd (or
2349 * perchance, a prime) hash size for better hash distribution.
2350 */
2351 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2352 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2353
2354 agb->dtagb_hashsize = hashsize;
2355 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2356 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2357 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2358
2359 for (i = 0; i < agb->dtagb_hashsize; i++)
2360 agb->dtagb_hash[i] = NULL;
2361 }
2362
2363 ASSERT(agg->dtag_first != NULL);
2364 ASSERT(agg->dtag_first->dta_intuple);
2365
2366 /*
2367 * Calculate the hash value based on the key. Note that we _don't_
2368 * include the aggid in the hashing (but we will store it as part of
2369 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2370 * algorithm: a simple, quick algorithm that has no known funnels, and
2371 * gets good distribution in practice. The efficacy of the hashing
2372 * algorithm (and a comparison with other algorithms) may be found by
2373 * running the ::dtrace_aggstat MDB dcmd.
2374 */
2375 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2376 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2377 limit = i + act->dta_rec.dtrd_size;
2378 ASSERT(limit <= size);
2379 isstr = DTRACEACT_ISSTRING(act);
2380
2381 for (; i < limit; i++) {
2382 hashval += data[i];
2383 hashval += (hashval << 10);
2384 hashval ^= (hashval >> 6);
2385
2386 if (isstr && data[i] == '\0')
2387 break;
2388 }
2389 }
2390
2391 hashval += (hashval << 3);
2392 hashval ^= (hashval >> 11);
2393 hashval += (hashval << 15);
2394
2395 /*
2396 * Yes, the divide here is expensive -- but it's generally the least
2397 * of the performance issues given the amount of data that we iterate
2398 * over to compute hash values, compare data, etc.
2399 */
2400 ndx = hashval % agb->dtagb_hashsize;
2401
2402 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2403 ASSERT((caddr_t)key >= tomax);
2404 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2405
2406 if (hashval != key->dtak_hashval || key->dtak_size != size)
2407 continue;
2408
2409 kdata = key->dtak_data;
2410 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2411
2412 for (act = agg->dtag_first; act->dta_intuple;
2413 act = act->dta_next) {
2414 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2415 limit = i + act->dta_rec.dtrd_size;
2416 ASSERT(limit <= size);
2417 isstr = DTRACEACT_ISSTRING(act);
2418
2419 for (; i < limit; i++) {
2420 if (kdata[i] != data[i])
2421 goto next;
2422
2423 if (isstr && data[i] == '\0')
2424 break;
2425 }
2426 }
2427
2428 if (action != key->dtak_action) {
2429 /*
2430 * We are aggregating on the same value in the same
2431 * aggregation with two different aggregating actions.
2432 * (This should have been picked up in the compiler,
2433 * so we may be dealing with errant or devious DIF.)
2434 * This is an error condition; we indicate as much,
2435 * and return.
2436 */
2437 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2438 return;
2439 }
2440
2441 /*
2442 * This is a hit: we need to apply the aggregator to
2443 * the value at this key.
2444 */
2445 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2446 return;
2447 next:
2448 continue;
2449 }
2450
2451 /*
2452 * We didn't find it. We need to allocate some zero-filled space,
2453 * link it into the hash table appropriately, and apply the aggregator
2454 * to the (zero-filled) value.
2455 */
2456 offs = buf->dtb_offset;
2457 while (offs & (align - 1))
2458 offs += sizeof (uint32_t);
2459
2460 /*
2461 * If we don't have enough room to both allocate a new key _and_
2462 * its associated data, increment the drop count and return.
2463 */
2464 if ((uintptr_t)tomax + offs + fsize >
2465 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2466 dtrace_buffer_drop(buf);
2467 return;
2468 }
2469
2470 /*CONSTCOND*/
2471 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2472 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2473 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2474
2475 key->dtak_data = kdata = tomax + offs;
2476 buf->dtb_offset = offs + fsize;
2477
2478 /*
2479 * Now copy the data across.
2480 */
2481 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2482
2483 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2484 kdata[i] = data[i];
2485
2486 /*
2487 * Because strings are not zeroed out by default, we need to iterate
2488 * looking for actions that store strings, and we need to explicitly
2489 * pad these strings out with zeroes.
2490 */
2491 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2492 int nul;
2493
2494 if (!DTRACEACT_ISSTRING(act))
2495 continue;
2496
2497 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2498 limit = i + act->dta_rec.dtrd_size;
2499 ASSERT(limit <= size);
2500
2501 for (nul = 0; i < limit; i++) {
2502 if (nul) {
2503 kdata[i] = '\0';
2504 continue;
2505 }
2506
2507 if (data[i] != '\0')
2508 continue;
2509
2510 nul = 1;
2511 }
2512 }
2513
2514 for (i = size; i < fsize; i++)
2515 kdata[i] = 0;
2516
2517 key->dtak_hashval = hashval;
2518 key->dtak_size = size;
2519 key->dtak_action = action;
2520 key->dtak_next = agb->dtagb_hash[ndx];
2521 agb->dtagb_hash[ndx] = key;
2522
2523 /*
2524 * Finally, apply the aggregator.
2525 */
2526 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2527 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2528 }
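
/*
 * Editorial note (not part of the original source): the aggregation buffer
 * managed above is thus split in two -- key and value data grow upward from
 * the start of the buffer (tracked by dtb_offset), while the
 * dtrace_aggbuffer_t metastructure, its hash table and the dtrace_aggkey_t
 * entries grow downward from the end (tracked by dtagb_free). A drop is
 * recorded whenever storing a new key and its data would cause the two
 * regions to collide.
 */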
2529
2530 /*
2531 * Given consumer state, this routine finds a speculation in the INACTIVE
2532 * state and transitions it into the ACTIVE state. If there is no speculation
2533 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2534 * incremented -- it is up to the caller to take appropriate action.
2535 */
2536 static int
2537 dtrace_speculation(dtrace_state_t *state)
2538 {
2539 int i = 0;
2540 dtrace_speculation_state_t current;
2541 uint32_t *stat = &state->dts_speculations_unavail, count;
2542
2543 while (i < state->dts_nspeculations) {
2544 dtrace_speculation_t *spec = &state->dts_speculations[i];
2545
2546 current = spec->dtsp_state;
2547
2548 if (current != DTRACESPEC_INACTIVE) {
2549 if (current == DTRACESPEC_COMMITTINGMANY ||
2550 current == DTRACESPEC_COMMITTING ||
2551 current == DTRACESPEC_DISCARDING)
2552 stat = &state->dts_speculations_busy;
2553 i++;
2554 continue;
2555 }
2556
2557 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2558 current, DTRACESPEC_ACTIVE) == current)
2559 return (i + 1);
2560 }
2561
2562 /*
2563 * We couldn't find a speculation. If we found as much as a single
2564 * busy speculation buffer, we'll attribute this failure as "busy"
2565 * instead of "unavail".
2566 */
2567 do {
2568 count = *stat;
2569 } while (dtrace_cas32(stat, count, count + 1) != count);
2570
2571 return (0);
2572 }
2573
2574 /*
2575 * This routine commits an active speculation. If the specified speculation
2576 * is not in a valid state to perform a commit(), this routine will silently do
2577 * nothing. The state of the specified speculation is transitioned according
2578 * to the state transition diagram outlined in <sys/dtrace_impl.h>
2579 */
2580 static void
2581 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2582 dtrace_specid_t which)
2583 {
2584 dtrace_speculation_t *spec;
2585 dtrace_buffer_t *src, *dest;
2586 uintptr_t daddr, saddr, dlimit;
2587 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2588 intptr_t offs;
2589
2590 if (which == 0)
2591 return;
2592
2593 if (which > (dtrace_specid_t)state->dts_nspeculations) {
2594 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2595 return;
2596 }
2597
2598 spec = &state->dts_speculations[which - 1];
2599 src = &spec->dtsp_buffer[cpu];
2600 dest = &state->dts_buffer[cpu];
2601
2602 do {
2603 current = spec->dtsp_state;
2604
2605 if (current == DTRACESPEC_COMMITTINGMANY)
2606 break;
2607
2608 switch (current) {
2609 case DTRACESPEC_INACTIVE:
2610 case DTRACESPEC_DISCARDING:
2611 return;
2612
2613 case DTRACESPEC_COMMITTING:
2614 /*
2615 * This is only possible if we are (a) commit()'ing
2616 * without having done a prior speculate() on this CPU
2617 * and (b) racing with another commit() on a different
2618 * CPU. There's nothing to do -- we just assert that
2619 * our offset is 0.
2620 */
2621 ASSERT(src->dtb_offset == 0);
2622 return;
2623
2624 case DTRACESPEC_ACTIVE:
2625 new = DTRACESPEC_COMMITTING;
2626 break;
2627
2628 case DTRACESPEC_ACTIVEONE:
2629 /*
2630 * This speculation is active on one CPU. If our
2631 * buffer offset is non-zero, we know that the one CPU
2632 * must be us. Otherwise, we are committing on a
2633 * different CPU from the speculate(), and we must
2634 * rely on being asynchronously cleaned.
2635 */
2636 if (src->dtb_offset != 0) {
2637 new = DTRACESPEC_COMMITTING;
2638 break;
2639 }
2640 /*FALLTHROUGH*/
2641
2642 case DTRACESPEC_ACTIVEMANY:
2643 new = DTRACESPEC_COMMITTINGMANY;
2644 break;
2645
2646 default:
2647 ASSERT(0);
2648 }
2649 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2650 current, new) != current);
2651
2652 /*
2653 * We have set the state to indicate that we are committing this
2654 * speculation. Now reserve the necessary space in the destination
2655 * buffer.
2656 */
2657 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2658 sizeof (uint64_t), state, NULL)) < 0) {
2659 dtrace_buffer_drop(dest);
2660 goto out;
2661 }
2662
2663 /*
2664 * We have the space; copy the buffer across. (Note that this is a
2665 * highly suboptimal bcopy(); in the unlikely event that this becomes
2666 * a serious performance issue, a high-performance DTrace-specific
2667 * bcopy() should obviously be invented.)
2668 */
2669 daddr = (uintptr_t)dest->dtb_tomax + offs;
2670 dlimit = daddr + src->dtb_offset;
2671 saddr = (uintptr_t)src->dtb_tomax;
2672
2673 /*
2674 * First, the aligned portion.
2675 */
2676 while (dlimit - daddr >= sizeof (uint64_t)) {
2677 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2678
2679 daddr += sizeof (uint64_t);
2680 saddr += sizeof (uint64_t);
2681 }
2682
2683 /*
2684 * Now any left-over bit...
2685 */
2686 while (dlimit - daddr)
2687 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2688
2689 /*
2690 * Finally, commit the reserved space in the destination buffer.
2691 */
2692 dest->dtb_offset = offs + src->dtb_offset;
2693
2694 out:
2695 /*
2696 * If we're lucky enough to be the only active CPU on this speculation
2697 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2698 */
2699 if (current == DTRACESPEC_ACTIVE ||
2700 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2701 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2702 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2703 #pragma unused(rval) /* __APPLE__ */
2704
2705 ASSERT(rval == DTRACESPEC_COMMITTING);
2706 }
2707
2708 src->dtb_offset = 0;
2709 src->dtb_xamot_drops += src->dtb_drops;
2710 src->dtb_drops = 0;
2711 }
2712
2713 /*
2714 * This routine discards an active speculation. If the specified speculation
2715 * is not in a valid state to perform a discard(), this routine will silently
2716 * do nothing. The state of the specified speculation is transitioned
2717 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2718 */
2719 static void
2720 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2721 dtrace_specid_t which)
2722 {
2723 dtrace_speculation_t *spec;
2724 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2725 dtrace_buffer_t *buf;
2726
2727 if (which == 0)
2728 return;
2729
2730 if (which > (dtrace_specid_t)state->dts_nspeculations) {
2731 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2732 return;
2733 }
2734
2735 spec = &state->dts_speculations[which - 1];
2736 buf = &spec->dtsp_buffer[cpu];
2737
2738 do {
2739 current = spec->dtsp_state;
2740
2741 switch (current) {
2742 case DTRACESPEC_INACTIVE:
2743 case DTRACESPEC_COMMITTINGMANY:
2744 case DTRACESPEC_COMMITTING:
2745 case DTRACESPEC_DISCARDING:
2746 return;
2747
2748 case DTRACESPEC_ACTIVE:
2749 case DTRACESPEC_ACTIVEMANY:
2750 new = DTRACESPEC_DISCARDING;
2751 break;
2752
2753 case DTRACESPEC_ACTIVEONE:
2754 if (buf->dtb_offset != 0) {
2755 new = DTRACESPEC_INACTIVE;
2756 } else {
2757 new = DTRACESPEC_DISCARDING;
2758 }
2759 break;
2760
2761 default:
2762 ASSERT(0);
2763 }
2764 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2765 current, new) != current);
2766
2767 buf->dtb_offset = 0;
2768 buf->dtb_drops = 0;
2769 }
2770
2771 /*
2772 * Note: not called from probe context. This function is called
2773 * asynchronously from cross call context to clean any speculations that are
2774 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
2775 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2776 * speculation.
2777 */
2778 static void
2779 dtrace_speculation_clean_here(dtrace_state_t *state)
2780 {
2781 dtrace_icookie_t cookie;
2782 processorid_t cpu = CPU->cpu_id;
2783 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2784 dtrace_specid_t i;
2785
2786 cookie = dtrace_interrupt_disable();
2787
2788 if (dest->dtb_tomax == NULL) {
2789 dtrace_interrupt_enable(cookie);
2790 return;
2791 }
2792
2793 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
2794 dtrace_speculation_t *spec = &state->dts_speculations[i];
2795 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2796
2797 if (src->dtb_tomax == NULL)
2798 continue;
2799
2800 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2801 src->dtb_offset = 0;
2802 continue;
2803 }
2804
2805 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2806 continue;
2807
2808 if (src->dtb_offset == 0)
2809 continue;
2810
2811 dtrace_speculation_commit(state, cpu, i + 1);
2812 }
2813
2814 dtrace_interrupt_enable(cookie);
2815 }
2816
2817 /*
2818 * Note: not called from probe context. This function is called
2819 * asynchronously (and at a regular interval) to clean any speculations that
2820 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
2821 * is work to be done, it cross calls all CPUs to perform that work;
2822 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
2823 * INACTIVE state until they have been cleaned by all CPUs.
2824 */
2825 static void
2826 dtrace_speculation_clean(dtrace_state_t *state)
2827 {
2828 int work = 0;
2829 uint32_t rv;
2830 dtrace_specid_t i;
2831
2832 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
2833 dtrace_speculation_t *spec = &state->dts_speculations[i];
2834
2835 ASSERT(!spec->dtsp_cleaning);
2836
2837 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2838 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2839 continue;
2840
2841 work++;
2842 spec->dtsp_cleaning = 1;
2843 }
2844
2845 if (!work)
2846 return;
2847
2848 dtrace_xcall(DTRACE_CPUALL,
2849 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2850
2851 /*
2852 * We now know that all CPUs have committed or discarded their
2853 * speculation buffers, as appropriate. We can now set the state
2854 * to inactive.
2855 */
2856 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
2857 dtrace_speculation_t *spec = &state->dts_speculations[i];
2858 dtrace_speculation_state_t current, new;
2859
2860 if (!spec->dtsp_cleaning)
2861 continue;
2862
2863 current = spec->dtsp_state;
2864 ASSERT(current == DTRACESPEC_DISCARDING ||
2865 current == DTRACESPEC_COMMITTINGMANY);
2866
2867 new = DTRACESPEC_INACTIVE;
2868
2869 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2870 ASSERT(rv == current);
2871 spec->dtsp_cleaning = 0;
2872 }
2873 }
2874
2875 /*
2876 * Called as part of a speculate() to get the speculative buffer associated
2877 * with a given speculation. Returns NULL if the specified speculation is not
2878 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
2879 * the active CPU is not the specified CPU -- the speculation will be
2880 * atomically transitioned into the ACTIVEMANY state.
2881 */
2882 static dtrace_buffer_t *
2883 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2884 dtrace_specid_t which)
2885 {
2886 dtrace_speculation_t *spec;
2887 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2888 dtrace_buffer_t *buf;
2889
2890 if (which == 0)
2891 return (NULL);
2892
2893 if (which > (dtrace_specid_t)state->dts_nspeculations) {
2894 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2895 return (NULL);
2896 }
2897
2898 spec = &state->dts_speculations[which - 1];
2899 buf = &spec->dtsp_buffer[cpuid];
2900
2901 do {
2902 current = spec->dtsp_state;
2903
2904 switch (current) {
2905 case DTRACESPEC_INACTIVE:
2906 case DTRACESPEC_COMMITTINGMANY:
2907 case DTRACESPEC_DISCARDING:
2908 return (NULL);
2909
2910 case DTRACESPEC_COMMITTING:
2911 ASSERT(buf->dtb_offset == 0);
2912 return (NULL);
2913
2914 case DTRACESPEC_ACTIVEONE:
2915 /*
2916 * This speculation is currently active on one CPU.
2917 * Check the offset in the buffer; if it's non-zero,
2918 * that CPU must be us (and we leave the state alone).
2919 * If it's zero, assume that we're starting on a new
2920 * CPU -- and change the state to indicate that the
2921 * speculation is active on more than one CPU.
2922 */
2923 if (buf->dtb_offset != 0)
2924 return (buf);
2925
2926 new = DTRACESPEC_ACTIVEMANY;
2927 break;
2928
2929 case DTRACESPEC_ACTIVEMANY:
2930 return (buf);
2931
2932 case DTRACESPEC_ACTIVE:
2933 new = DTRACESPEC_ACTIVEONE;
2934 break;
2935
2936 default:
2937 ASSERT(0);
2938 }
2939 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2940 current, new) != current);
2941
2942 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2943 return (buf);
2944 }
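
/*
 * Editorial note (not part of the original source): taken together, the
 * routines above implement the speculation state machine described in
 * <sys/dtrace_impl.h>. In summary: dtrace_speculation() moves a speculation
 * from INACTIVE to ACTIVE; dtrace_speculation_buffer() refines ACTIVE to
 * ACTIVEONE on first use and ACTIVEONE to ACTIVEMANY when a second CPU
 * speculates; dtrace_speculation_commit() moves these to COMMITTING (the
 * single-CPU case, returned to INACTIVE once the copy completes) or to
 * COMMITTINGMANY; dtrace_speculation_discard() moves them to DISCARDING (or
 * directly back to INACTIVE for a single-CPU speculation); and
 * dtrace_speculation_clean() returns COMMITTINGMANY and DISCARDING
 * speculations to INACTIVE once every CPU has been cleaned.
 */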
2945
2946 /*
2947 * Return a string. In the event that the user lacks the privilege to access
2948 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2949 * don't fail access checking.
2950 *
2951 * dtrace_dif_variable() uses this routine as a helper for various
2952 * builtin values such as 'execname' and 'probefunc.'
2953 */
2954 static
2955 uintptr_t
2956 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2957 dtrace_mstate_t *mstate)
2958 {
2959 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2960 uintptr_t ret;
2961 size_t strsz;
2962
2963 /*
2964 * The easy case: this probe is allowed to read all of memory, so
2965 * we can just return this as a vanilla pointer.
2966 */
2967 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2968 return (addr);
2969
2970 /*
2971 * This is the tougher case: we copy the string in question from
2972 * kernel memory into scratch memory and return it that way: this
2973 * ensures that we won't trip up when access checking tests the
2974 * BYREF return value.
2975 */
2976 strsz = dtrace_strlen((char *)addr, size) + 1;
2977
2978 if (mstate->dtms_scratch_ptr + strsz >
2979 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2980 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2981 return (0);
2982 }
2983
2984 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2985 strsz);
2986 ret = mstate->dtms_scratch_ptr;
2987 mstate->dtms_scratch_ptr += strsz;
2988 return (ret);
2989 }
2990
2991 /*
2992 * This function implements the DIF emulator's variable lookups. The emulator
2993 * passes a reserved variable identifier and optional built-in array index.
2994 */
2995 static uint64_t
2996 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2997 uint64_t ndx)
2998 {
2999 /*
3000 * If we're accessing one of the uncached arguments, we'll turn this
3001 * into a reference in the args array.
3002 */
3003 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3004 ndx = v - DIF_VAR_ARG0;
3005 v = DIF_VAR_ARGS;
3006 }
3007
3008 switch (v) {
3009 case DIF_VAR_ARGS:
3010 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3011 if (ndx >= sizeof (mstate->dtms_arg) /
3012 sizeof (mstate->dtms_arg[0])) {
3013 /*
3014 * APPLE NOTE: Account for introduction of __dtrace_probe()
3015 */
3016 int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3017 dtrace_provider_t *pv;
3018 uint64_t val;
3019
3020 pv = mstate->dtms_probe->dtpr_provider;
3021 if (pv->dtpv_pops.dtps_getargval != NULL)
3022 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3023 mstate->dtms_probe->dtpr_id,
3024 mstate->dtms_probe->dtpr_arg, ndx, aframes);
3025 /* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
3026 else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && ndx == 5) {
3027 return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
3028 }
3029
3030 else
3031 val = dtrace_getarg(ndx, aframes);
3032
3033 /*
3034 * This is regrettably required to keep the compiler
3035 * from tail-optimizing the call to dtrace_getarg().
3036 * The condition always evaluates to true, but the
3037 * compiler has no way of figuring that out a priori.
3038 * (None of this would be necessary if the compiler
3039 * could be relied upon to _always_ tail-optimize
3040 * the call to dtrace_getarg() -- but it can't.)
3041 */
3042 if (mstate->dtms_probe != NULL)
3043 return (val);
3044
3045 ASSERT(0);
3046 }
3047
3048 return (mstate->dtms_arg[ndx]);
3049
3050 case DIF_VAR_UREGS: {
3051 thread_t thread;
3052
3053 if (!dtrace_priv_proc(state))
3054 return (0);
3055
3056 if ((thread = current_thread()) == NULL) {
3057 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3058 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3059 return (0);
3060 }
3061
3062 return (dtrace_getreg(find_user_regs(thread), ndx));
3063 }
3064
3065
3066 case DIF_VAR_CURTHREAD:
3067 if (!dtrace_priv_kernel(state))
3068 return (0);
3069
3070 return ((uint64_t)(uintptr_t)current_thread());
3071
3072 case DIF_VAR_TIMESTAMP:
3073 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3074 mstate->dtms_timestamp = dtrace_gethrtime();
3075 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3076 }
3077 return (mstate->dtms_timestamp);
3078
3079 case DIF_VAR_VTIMESTAMP:
3080 ASSERT(dtrace_vtime_references != 0);
3081 return (dtrace_get_thread_vtime(current_thread()));
3082
3083 case DIF_VAR_WALLTIMESTAMP:
3084 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3085 mstate->dtms_walltimestamp = dtrace_gethrestime();
3086 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3087 }
3088 return (mstate->dtms_walltimestamp);
3089
3090 case DIF_VAR_MACHTIMESTAMP:
3091 if (!(mstate->dtms_present & DTRACE_MSTATE_MACHTIMESTAMP)) {
3092 mstate->dtms_machtimestamp = mach_absolute_time();
3093 mstate->dtms_present |= DTRACE_MSTATE_MACHTIMESTAMP;
3094 }
3095 return (mstate->dtms_machtimestamp);
3096
3097 case DIF_VAR_IPL:
3098 if (!dtrace_priv_kernel(state))
3099 return (0);
3100 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3101 mstate->dtms_ipl = dtrace_getipl();
3102 mstate->dtms_present |= DTRACE_MSTATE_IPL;
3103 }
3104 return (mstate->dtms_ipl);
3105
3106 case DIF_VAR_EPID:
3107 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3108 return (mstate->dtms_epid);
3109
3110 case DIF_VAR_ID:
3111 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3112 return (mstate->dtms_probe->dtpr_id);
3113
3114 case DIF_VAR_STACKDEPTH:
3115 if (!dtrace_priv_kernel(state))
3116 return (0);
3117 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3118 /*
3119 * APPLE NOTE: Account for introduction of __dtrace_probe()
3120 */
3121 int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3122
3123 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3124 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3125 }
3126 return (mstate->dtms_stackdepth);
3127
3128 case DIF_VAR_USTACKDEPTH:
3129 if (!dtrace_priv_proc(state))
3130 return (0);
3131 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3132 /*
3133 * See comment in DIF_VAR_PID.
3134 */
3135 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3136 CPU_ON_INTR(CPU)) {
3137 mstate->dtms_ustackdepth = 0;
3138 } else {
3139 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3140 mstate->dtms_ustackdepth =
3141 dtrace_getustackdepth();
3142 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3143 }
3144 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3145 }
3146 return (mstate->dtms_ustackdepth);
3147
3148 case DIF_VAR_CALLER:
3149 if (!dtrace_priv_kernel(state))
3150 return (0);
3151 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3152 /*
3153 * APPLE NOTE: Account for introduction of __dtrace_probe()
3154 */
3155 int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3156
3157 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3158 /*
3159 * If this is an unanchored probe, we are
3160 * required to go through the slow path:
3161 * dtrace_caller() only guarantees correct
3162 * results for anchored probes.
3163 */
3164 pc_t caller[2];
3165
3166 dtrace_getpcstack(caller, 2, aframes,
3167 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3168 mstate->dtms_caller = caller[1];
3169 } else if ((mstate->dtms_caller =
3170 dtrace_caller(aframes)) == (uintptr_t)-1) {
3171 /*
3172 * We have failed to do this the quick way;
3173 * we must resort to the slower approach of
3174 * calling dtrace_getpcstack().
3175 */
3176 pc_t caller;
3177
3178 dtrace_getpcstack(&caller, 1, aframes, NULL);
3179 mstate->dtms_caller = caller;
3180 }
3181
3182 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3183 }
3184 return (mstate->dtms_caller);
3185
3186 case DIF_VAR_UCALLER:
3187 if (!dtrace_priv_proc(state))
3188 return (0);
3189
3190 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3191 uint64_t ustack[3];
3192
3193 /*
3194 * dtrace_getupcstack() fills in the first uint64_t
3195 * with the current PID. The second uint64_t will
3196 * be the program counter at user-level. The third
3197 * uint64_t will contain the caller, which is what
3198 * we're after.
3199 */
3200 ustack[2] = 0;
3201 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3202 dtrace_getupcstack(ustack, 3);
3203 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3204 mstate->dtms_ucaller = ustack[2];
3205 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3206 }
3207
3208 return (mstate->dtms_ucaller);
3209
3210 case DIF_VAR_PROBEPROV:
3211 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3212 return (dtrace_dif_varstr(
3213 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3214 state, mstate));
3215
3216 case DIF_VAR_PROBEMOD:
3217 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3218 return (dtrace_dif_varstr(
3219 (uintptr_t)mstate->dtms_probe->dtpr_mod,
3220 state, mstate));
3221
3222 case DIF_VAR_PROBEFUNC:
3223 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3224 return (dtrace_dif_varstr(
3225 (uintptr_t)mstate->dtms_probe->dtpr_func,
3226 state, mstate));
3227
3228 case DIF_VAR_PROBENAME:
3229 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3230 return (dtrace_dif_varstr(
3231 (uintptr_t)mstate->dtms_probe->dtpr_name,
3232 state, mstate));
3233
3234 case DIF_VAR_PID:
3235 if (!dtrace_priv_proc_relaxed(state))
3236 return (0);
3237
3238 /*
3239 * Note that we are assuming that an unanchored probe is
3240 * always due to a high-level interrupt. (And we're assuming
3241 * that there is only a single high level interrupt.)
3242 */
3243 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3244 /* Anchored probe that fires while on an interrupt accrues to process 0 */
3245 return 0;
3246
3247 return ((uint64_t)dtrace_proc_selfpid());
3248
3249 case DIF_VAR_PPID:
3250 if (!dtrace_priv_proc_relaxed(state))
3251 return (0);
3252
3253 /*
3254 * See comment in DIF_VAR_PID.
3255 */
3256 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3257 return (0);
3258
3259 return ((uint64_t)dtrace_proc_selfppid());
3260
3261 case DIF_VAR_TID:
3262 /* We do not need to check for null current_thread() */
3263 return thread_tid(current_thread()); /* globally unique */
3264
3265 case DIF_VAR_PTHREAD_SELF:
3266 if (!dtrace_priv_proc(state))
3267 return (0);
3268
3269 /* Not currently supported, but we should be able to delta the dispatchqaddr and dispatchqoffset to get pthread_self */
3270 return 0;
3271
3272 case DIF_VAR_DISPATCHQADDR:
3273 if (!dtrace_priv_proc(state))
3274 return (0);
3275
3276 /* We do not need to check for null current_thread() */
3277 return thread_dispatchqaddr(current_thread());
3278
3279 case DIF_VAR_EXECNAME:
3280 {
3281 char *xname = (char *)mstate->dtms_scratch_ptr;
3282 size_t scratch_size = MAXCOMLEN+1;
3283
3284 /* The scratch allocation's lifetime is that of the clause. */
3285 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3286 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3287 return 0;
3288 }
3289
3290 if (!dtrace_priv_proc_relaxed(state))
3291 return (0);
3292
3293 mstate->dtms_scratch_ptr += scratch_size;
3294 proc_selfname( xname, MAXCOMLEN );
3295
3296 return ((uint64_t)(uintptr_t)xname);
3297 }
3298
3299
3300 case DIF_VAR_ZONENAME:
3301 {
3302 /* scratch_size is equal to length('global') + 1 for the null-terminator. */
3303 char *zname = (char *)mstate->dtms_scratch_ptr;
3304 size_t scratch_size = 6 + 1;
3305
3306 if (!dtrace_priv_proc(state))
3307 return (0);
3308
3309 /* The scratch allocation's lifetime is that of the clause. */
3310 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3311 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3312 return 0;
3313 }
3314
3315 mstate->dtms_scratch_ptr += scratch_size;
3316
3317 /* The kernel does not provide zonename; it will always return 'global'. */
3318 strlcpy(zname, "global", scratch_size);
3319
3320 return ((uint64_t)(uintptr_t)zname);
3321 }
3322
3323 case DIF_VAR_UID:
3324 if (!dtrace_priv_proc_relaxed(state))
3325 return (0);
3326
3327 /*
3328 * See comment in DIF_VAR_PID.
3329 */
3330 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3331 return (0);
3332
3333 return ((uint64_t) dtrace_proc_selfruid());
3334
3335 case DIF_VAR_GID:
3336 if (!dtrace_priv_proc(state))
3337 return (0);
3338
3339 /*
3340 * See comment in DIF_VAR_PID.
3341 */
3342 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3343 return (0);
3344
3345 if (dtrace_CRED() != NULL)
3346 /* Credential does not require lazy initialization. */
3347 return ((uint64_t)kauth_getgid());
3348 else {
3349 /* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
3350 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3351 return -1ULL;
3352 }
3353
3354 case DIF_VAR_ERRNO: {
3355 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
3356 if (!dtrace_priv_proc(state))
3357 return (0);
3358
3359 /*
3360 * See comment in DIF_VAR_PID.
3361 */
3362 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3363 return (0);
3364
3365 if (uthread)
3366 return (uint64_t)uthread->t_dtrace_errno;
3367 else {
3368 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3369 return -1ULL;
3370 }
3371 }
3372
3373 default:
3374 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3375 return (0);
3376 }
3377 }
3378
3379 /*
3380 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
3381 * Notice that we don't bother validating the proper number of arguments or
3382 * their types in the tuple stack. This isn't needed because all argument
3383 * interpretation is safe because of our load safety -- the worst that can
3384 * happen is that a bogus program can obtain bogus results.
3385 */
3386 static void
3387 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3388 dtrace_key_t *tupregs, int nargs,
3389 dtrace_mstate_t *mstate, dtrace_state_t *state)
3390 {
3391 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
3392 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
3393 dtrace_vstate_t *vstate = &state->dts_vstate;
3394
3395 #if !defined(__APPLE__)
3396 union {
3397 mutex_impl_t mi;
3398 uint64_t mx;
3399 } m;
3400
3401 union {
3402 krwlock_t ri;
3403 uintptr_t rw;
3404 } r;
3405 #else
3406 /* FIXME: awaits lock/mutex work */
3407 #endif /* __APPLE__ */
3408
3409 switch (subr) {
3410 case DIF_SUBR_RAND:
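/*
 * Editorial note (not part of the original source): this is a cheap
 * linear-congruential-style value derived from the high-resolution
 * timestamp -- inexpensive and safe in probe context, but in no way a
 * cryptographic-quality source of randomness.
 */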
3411 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3412 break;
3413
3414 #if !defined(__APPLE__)
3415 case DIF_SUBR_MUTEX_OWNED:
3416 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3417 mstate, vstate)) {
3418 regs[rd] = 0;
3419 break;
3420 }
3421
3422 m.mx = dtrace_load64(tupregs[0].dttk_value);
3423 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3424 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3425 else
3426 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3427 break;
3428
3429 case DIF_SUBR_MUTEX_OWNER:
3430 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3431 mstate, vstate)) {
3432 regs[rd] = 0;
3433 break;
3434 }
3435
3436 m.mx = dtrace_load64(tupregs[0].dttk_value);
3437 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3438 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3439 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3440 else
3441 regs[rd] = 0;
3442 break;
3443
3444 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3445 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3446 mstate, vstate)) {
3447 regs[rd] = 0;
3448 break;
3449 }
3450
3451 m.mx = dtrace_load64(tupregs[0].dttk_value);
3452 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3453 break;
3454
3455 case DIF_SUBR_MUTEX_TYPE_SPIN:
3456 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3457 mstate, vstate)) {
3458 regs[rd] = 0;
3459 break;
3460 }
3461
3462 m.mx = dtrace_load64(tupregs[0].dttk_value);
3463 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3464 break;
3465
3466 case DIF_SUBR_RW_READ_HELD: {
3467 uintptr_t tmp;
3468
3469 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3470 mstate, vstate)) {
3471 regs[rd] = 0;
3472 break;
3473 }
3474
3475 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3476 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3477 break;
3478 }
3479
3480 case DIF_SUBR_RW_WRITE_HELD:
3481 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3482 mstate, vstate)) {
3483 regs[rd] = 0;
3484 break;
3485 }
3486
3487 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3488 regs[rd] = _RW_WRITE_HELD(&r.ri);
3489 break;
3490
3491 case DIF_SUBR_RW_ISWRITER:
3492 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3493 mstate, vstate)) {
3494 regs[rd] = 0;
3495 break;
3496 }
3497
3498 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3499 regs[rd] = _RW_ISWRITER(&r.ri);
3500 break;
3501 #else
3502 /* FIXME: awaits lock/mutex work */
3503 #endif /* __APPLE__ */
3504
3505 case DIF_SUBR_BCOPY: {
3506 /*
3507 * We need to be sure that the destination is in the scratch
3508 * region -- no other region is allowed.
3509 */
3510 uintptr_t src = tupregs[0].dttk_value;
3511 uintptr_t dest = tupregs[1].dttk_value;
3512 size_t size = tupregs[2].dttk_value;
3513
3514 if (!dtrace_inscratch(dest, size, mstate)) {
3515 *flags |= CPU_DTRACE_BADADDR;
3516 *illval = regs[rd];
3517 break;
3518 }
3519
3520 if (!dtrace_canload(src, size, mstate, vstate)) {
3521 regs[rd] = 0;
3522 break;
3523 }
3524
3525 dtrace_bcopy((void *)src, (void *)dest, size);
3526 break;
3527 }
3528
3529 case DIF_SUBR_ALLOCA:
3530 case DIF_SUBR_COPYIN: {
3531 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3532 uint64_t size =
3533 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3534 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3535
3536 /*
3537 * This action doesn't require any credential checks since
3538 * probes will not activate in user contexts to which the
3539 * enabling user does not have permissions.
3540 */
3541
3542 /*
3543 * Rounding up the user allocation size could have overflowed
3544 * a large, bogus allocation (like -1ULL) to 0.
3545 */
3546 if (scratch_size < size ||
3547 !DTRACE_INSCRATCH(mstate, scratch_size)) {
3548 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3549 regs[rd] = 0;
3550 break;
3551 }
3552
3553 if (subr == DIF_SUBR_COPYIN) {
3554 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3555 if (dtrace_priv_proc(state))
3556 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3557 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3558 }
3559
3560 mstate->dtms_scratch_ptr += scratch_size;
3561 regs[rd] = dest;
3562 break;
3563 }
3564
3565 case DIF_SUBR_COPYINTO: {
3566 uint64_t size = tupregs[1].dttk_value;
3567 uintptr_t dest = tupregs[2].dttk_value;
3568
3569 /*
3570 * This action doesn't require any credential checks since
3571 * probes will not activate in user contexts to which the
3572 * enabling user does not have permissions.
3573 */
3574 if (!dtrace_inscratch(dest, size, mstate)) {
3575 *flags |= CPU_DTRACE_BADADDR;
3576 *illval = regs[rd];
3577 break;
3578 }
3579
3580 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3581 if (dtrace_priv_proc(state))
3582 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3583 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3584 break;
3585 }
3586
3587 case DIF_SUBR_COPYINSTR: {
3588 uintptr_t dest = mstate->dtms_scratch_ptr;
3589 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3590
3591 if (nargs > 1 && tupregs[1].dttk_value < size)
3592 size = tupregs[1].dttk_value + 1;
3593
3594 /*
3595 * This action doesn't require any credential checks since
3596 * probes will not activate in user contexts to which the
3597 * enabling user does not have permissions.
3598 */
3599 if (!DTRACE_INSCRATCH(mstate, size)) {
3600 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3601 regs[rd] = 0;
3602 break;
3603 }
3604
3605 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3606 if (dtrace_priv_proc(state))
3607 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3608 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3609
3610 ((char *)dest)[size - 1] = '\0';
3611 mstate->dtms_scratch_ptr += size;
3612 regs[rd] = dest;
3613 break;
3614 }
3615
3616 case DIF_SUBR_MSGSIZE:
3617 case DIF_SUBR_MSGDSIZE: {
3618 /* Darwin does not implement SysV streams messages */
3619 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3620 regs[rd] = 0;
3621 break;
3622 }
3623
3624 case DIF_SUBR_PROGENYOF: {
3625 pid_t pid = tupregs[0].dttk_value;
3626 struct proc *p = current_proc();
3627 int rval = 0, lim = nprocs;
3628
3629 while(p && (lim-- > 0)) {
3630 pid_t ppid;
3631
3632 ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
3633 if (*flags & CPU_DTRACE_FAULT)
3634 break;
3635
3636 if (ppid == pid) {
3637 rval = 1;
3638 break;
3639 }
3640
3641 if (ppid == 0)
3642 break; /* Can't climb process tree any further. */
3643
3644 p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
3645 if (*flags & CPU_DTRACE_FAULT)
3646 break;
3647 }
3648
3649 regs[rd] = rval;
3650 break;
3651 }
3652
3653 case DIF_SUBR_SPECULATION:
3654 regs[rd] = dtrace_speculation(state);
3655 break;
3656
3657
3658 case DIF_SUBR_COPYOUT: {
3659 uintptr_t kaddr = tupregs[0].dttk_value;
3660 user_addr_t uaddr = tupregs[1].dttk_value;
3661 uint64_t size = tupregs[2].dttk_value;
3662
3663 if (!dtrace_destructive_disallow &&
3664 dtrace_priv_proc_control(state) &&
3665 !dtrace_istoxic(kaddr, size)) {
3666 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3667 dtrace_copyout(kaddr, uaddr, size, flags);
3668 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3669 }
3670 break;
3671 }
3672
3673 case DIF_SUBR_COPYOUTSTR: {
3674 uintptr_t kaddr = tupregs[0].dttk_value;
3675 user_addr_t uaddr = tupregs[1].dttk_value;
3676 uint64_t size = tupregs[2].dttk_value;
3677
3678 if (!dtrace_destructive_disallow &&
3679 dtrace_priv_proc_control(state) &&
3680 !dtrace_istoxic(kaddr, size)) {
3681 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3682 dtrace_copyoutstr(kaddr, uaddr, size, flags);
3683 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3684 }
3685 break;
3686 }
3687
3688 case DIF_SUBR_STRLEN: {
3689 size_t sz;
3690 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3691 sz = dtrace_strlen((char *)addr,
3692 state->dts_options[DTRACEOPT_STRSIZE]);
3693
3694 if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3695 regs[rd] = 0;
3696 break;
3697 }
3698
3699 regs[rd] = sz;
3700
3701 break;
3702 }
3703
3704 case DIF_SUBR_STRCHR:
3705 case DIF_SUBR_STRRCHR: {
3706 /*
3707 * We're going to iterate over the string looking for the
3708 * specified character. We will iterate until we have reached
3709 * the string length or we have found the character. If this
3710 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3711 * of the specified character instead of the first.
3712 */
3713 uintptr_t saddr = tupregs[0].dttk_value;
3714 uintptr_t addr = tupregs[0].dttk_value;
3715 uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
3716 char c, target = (char)tupregs[1].dttk_value;
3717
3718 for (regs[rd] = 0; addr < limit; addr++) {
3719 if ((c = dtrace_load8(addr)) == target) {
3720 regs[rd] = addr;
3721
3722 if (subr == DIF_SUBR_STRCHR)
3723 break;
3724 }
3725
3726 if (c == '\0')
3727 break;
3728 }
3729
3730 if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
3731 regs[rd] = 0;
3732 break;
3733 }
3734
3735 break;
3736 }
3737
3738 case DIF_SUBR_STRSTR:
3739 case DIF_SUBR_INDEX:
3740 case DIF_SUBR_RINDEX: {
3741 /*
3742 * We're going to iterate over the string looking for the
3743 * specified string. We will iterate until we have reached
3744 * the string length or we have found the string. (Yes, this
3745 * is done in the most naive way possible -- but considering
3746 * that the string we're searching for is likely to be
3747 * relatively short, the complexity of Rabin-Karp or similar
3748 * hardly seems merited.)
3749 */
3750 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
3751 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
3752 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3753 size_t len = dtrace_strlen(addr, size);
3754 size_t sublen = dtrace_strlen(substr, size);
3755 char *limit = addr + len, *orig = addr;
3756 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
3757 int inc = 1;
3758
3759 regs[rd] = notfound;
3760
3761 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
3762 regs[rd] = 0;
3763 break;
3764 }
3765
3766 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
3767 vstate)) {
3768 regs[rd] = 0;
3769 break;
3770 }
3771
3772 /*
3773 * strstr() and index()/rindex() have similar semantics if
3774 * both strings are the empty string: strstr() returns a
3775 * pointer to the (empty) string, and index() and rindex()
3776 * both return index 0 (regardless of any position argument).
3777 */
3778 if (sublen == 0 && len == 0) {
3779 if (subr == DIF_SUBR_STRSTR)
3780 regs[rd] = (uintptr_t)addr;
3781 else
3782 regs[rd] = 0;
3783 break;
3784 }
3785
3786 if (subr != DIF_SUBR_STRSTR) {
3787 if (subr == DIF_SUBR_RINDEX) {
3788 limit = orig - 1;
3789 addr += len;
3790 inc = -1;
3791 }
3792
3793 /*
3794 * Both index() and rindex() take an optional position
3795 * argument that denotes the starting position.
3796 */
3797 if (nargs == 3) {
3798 int64_t pos = (int64_t)tupregs[2].dttk_value;
3799
3800 /*
3801 * If the position argument to index() is
3802 * negative, Perl implicitly clamps it at
3803 * zero. This semantic is a little surprising
3804 * given the special meaning of negative
3805 * positions to similar Perl functions like
3806 * substr(), but it appears to reflect a
3807 * notion that index() can start from a
3808 * negative index and increment its way up to
3809 * the string. Given this notion, Perl's
3810 * rindex() is at least self-consistent in
3811 * that it implicitly clamps positions greater
3812 * than the string length to be the string
3813 * length. Where Perl completely loses
3814 * coherence, however, is when the specified
3815 * substring is the empty string (""). In
3816 * this case, even if the position is
3817 * negative, rindex() returns 0 -- and even if
3818 * the position is greater than the length,
3819 * index() returns the string length. These
3820 * semantics violate the notion that index()
3821 * should never return a value less than the
3822 * specified position and that rindex() should
3823 * never return a value greater than the
3824 * specified position. (One assumes that
3825 * these semantics are artifacts of Perl's
3826 * implementation and not the results of
3827 * deliberate design -- it beggars belief that
3828 * even Larry Wall could desire such oddness.)
3829 * While in the abstract one would wish for
3830 * consistent position semantics across
3831 * substr(), index() and rindex() -- or at the
3832 * very least self-consistent position
3833 * semantics for index() and rindex() -- we
3834 * instead opt to keep with the extant Perl
3835 * semantics, in all their broken glory. (Do
3836 * we have more desire to maintain Perl's
3837 * semantics than Perl does? Probably.)
3838 */
3839 if (subr == DIF_SUBR_RINDEX) {
3840 if (pos < 0) {
3841 if (sublen == 0)
3842 regs[rd] = 0;
3843 break;
3844 }
3845
3846 if ((size_t)pos > len)
3847 pos = len;
3848 } else {
3849 if (pos < 0)
3850 pos = 0;
3851
3852 if ((size_t)pos >= len) {
3853 if (sublen == 0)
3854 regs[rd] = len;
3855 break;
3856 }
3857 }
3858
3859 addr = orig + pos;
3860 }
3861 }
3862
3863 for (regs[rd] = notfound; addr != limit; addr += inc) {
3864 if (dtrace_strncmp(addr, substr, sublen) == 0) {
3865 if (subr != DIF_SUBR_STRSTR) {
3866 /*
3867 * As D index() and rindex() are
3868 * modeled on Perl (and not on awk),
3869 * we return a zero-based (and not a
3870 * one-based) index. (For you Perl
3871 * weenies: no, we're not going to add
3872 * $[ -- and shouldn't you be at a con
3873 * or something?)
3874 */
3875 regs[rd] = (uintptr_t)(addr - orig);
3876 break;
3877 }
3878
3879 ASSERT(subr == DIF_SUBR_STRSTR);
3880 regs[rd] = (uintptr_t)addr;
3881 break;
3882 }
3883 }
3884
3885 break;
3886 }
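/*
 * Worked examples (illustrative, derived from the clamping logic above)
 * of the Perl-compatible position handling in D:
 *
 *	strstr("frobnicate", "nic")	-> pointer to "nicate"
 *	index("frobnicate", "nic")	-> 4	(zero-based, unlike awk)
 *	index("frobnicate", "xyz")	-> -1	(not found)
 *	index("frobnicate", "", 99)	-> 10	(empty substring; a position
 *						past the end clamps to the
 *						string length)
 *	rindex("frobnicate", "", -5)	-> 0	(empty substring; a negative
 *						position clamps to zero)
 */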
3887
3888 case DIF_SUBR_STRTOK: {
3889 uintptr_t addr = tupregs[0].dttk_value;
3890 uintptr_t tokaddr = tupregs[1].dttk_value;
3891 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3892 uintptr_t limit, toklimit = tokaddr + size;
3893 char *dest = (char *)mstate->dtms_scratch_ptr;
3894 uint8_t c = '\0', tokmap[32]; /* 256 / 8 */
3895 uint64_t i = 0;
3896
3897 /*
3898 * Check both the token buffer and (later) the input buffer,
3899 * since both could be non-scratch addresses.
3900 */
3901 if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
3902 regs[rd] = 0;
3903 break;
3904 }
3905
3906 if (!DTRACE_INSCRATCH(mstate, size)) {
3907 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3908 regs[rd] = 0;
3909 break;
3910 }
3911
3912 if (addr == 0) {
3913 /*
3914 * If the address specified is NULL, we use our saved
3915 * strtok pointer from the mstate. Note that this
3916 * means that the saved strtok pointer is _only_
3917 * valid within multiple enablings of the same probe --
3918 * it behaves like an implicit clause-local variable.
3919 */
3920 addr = mstate->dtms_strtok;
3921 } else {
3922 /*
3923 * If the user-specified address is non-NULL we must
3924 * access check it. This is the only time we have
3925 * a chance to do so, since this address may reside
3926 * in the string table of this clause -- future calls
3927 * (when we fetch addr from mstate->dtms_strtok)
3928 * would fail this access check.
3929 */
3930 if (!dtrace_strcanload(addr, size, mstate, vstate)) {
3931 regs[rd] = 0;
3932 break;
3933 }
3934 }
3935
3936 /*
3937 * First, zero the token map, and then process the token
3938 * string -- setting a bit in the map for every character
3939 * found in the token string.
3940 */
3941 for (i = 0; i < (int)sizeof (tokmap); i++)
3942 tokmap[i] = 0;
3943
3944 for (; tokaddr < toklimit; tokaddr++) {
3945 if ((c = dtrace_load8(tokaddr)) == '\0')
3946 break;
3947
3948 ASSERT((c >> 3) < sizeof (tokmap));
3949 tokmap[c >> 3] |= (1 << (c & 0x7));
3950 }
3951
3952 for (limit = addr + size; addr < limit; addr++) {
3953 /*
3954 * We're looking for a character that is _not_ contained
3955 * in the token string.
3956 */
3957 if ((c = dtrace_load8(addr)) == '\0')
3958 break;
3959
3960 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
3961 break;
3962 }
3963
3964 if (c == '\0') {
3965 /*
3966 * We reached the end of the string without finding
3967 * any character that was not in the token string.
3968 * We return NULL in this case, and we set the saved
3969 * address to NULL as well.
3970 */
3971 regs[rd] = 0;
3972 mstate->dtms_strtok = 0;
3973 break;
3974 }
3975
3976 /*
3977 * From here on, we're copying into the destination string.
3978 */
3979 for (i = 0; addr < limit && i < size - 1; addr++) {
3980 if ((c = dtrace_load8(addr)) == '\0')
3981 break;
3982
3983 if (tokmap[c >> 3] & (1 << (c & 0x7)))
3984 break;
3985
3986 ASSERT(i < size);
3987 dest[i++] = c;
3988 }
3989
3990 ASSERT(i < size);
3991 dest[i] = '\0';
3992 regs[rd] = (uintptr_t)dest;
3993 mstate->dtms_scratch_ptr += size;
3994 mstate->dtms_strtok = addr;
3995 break;
3996 }
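/*
 * Illustrative sketch (not part of the original implementation): a
 * user-space analogue of the 256-bit token map built above.  Each byte
 * of the token string sets one bit, so the scan loops can test set
 * membership with a single mask:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static int
 *	is_token_char(const uint8_t tokmap[32], char ch)
 *	{
 *		uint8_t c = (uint8_t)ch;
 *		return ((tokmap[c >> 3] & (1 << (c & 0x7))) != 0);
 *	}
 *
 *	// Build the map from a delimiter string, e.g. " /:".
 *	uint8_t tokmap[32];
 *	memset(tokmap, 0, sizeof (tokmap));
 *	for (const char *t = " /:"; *t != '\0'; t++)
 *		tokmap[(uint8_t)*t >> 3] |= (1 << ((uint8_t)*t & 0x7));
 */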
3997
3998 case DIF_SUBR_SUBSTR: {
3999 uintptr_t s = tupregs[0].dttk_value;
4000 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4001 char *d = (char *)mstate->dtms_scratch_ptr;
4002 int64_t index = (int64_t)tupregs[1].dttk_value;
4003 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4004 size_t len = dtrace_strlen((char *)s, size);
4005 int64_t i = 0;
4006
4007 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4008 regs[rd] = 0;
4009 break;
4010 }
4011
4012 if (!DTRACE_INSCRATCH(mstate, size)) {
4013 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4014 regs[rd] = 0;
4015 break;
4016 }
4017
4018 if (nargs <= 2)
4019 remaining = (int64_t)size;
4020
4021 if (index < 0) {
4022 index += len;
4023
4024 if (index < 0 && index + remaining > 0) {
4025 remaining += index;
4026 index = 0;
4027 }
4028 }
4029
4030 if ((size_t)index >= len || index < 0) {
4031 remaining = 0;
4032 } else if (remaining < 0) {
4033 remaining += len - index;
4034 } else if ((uint64_t)index + (uint64_t)remaining > size) {
4035 remaining = size - index;
4036 }
4037
4038 for (i = 0; i < remaining; i++) {
4039 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4040 break;
4041 }
4042
4043 d[i] = '\0';
4044
4045 mstate->dtms_scratch_ptr += size;
4046 regs[rd] = (uintptr_t)d;
4047 break;
4048 }
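/*
 * Worked examples (illustrative) of the index/remaining clamping
 * performed above, with "dtrace" as the subject string (len == 6):
 *
 *	substr("dtrace", 2)	-> "race"	(no length: copy to the NUL)
 *	substr("dtrace", -3, 2)	-> "ac"		(index -3 + len == 3)
 *	substr("dtrace", 9, 4)	-> ""		(index beyond the string
 *						clamps remaining to 0)
 *	substr("dtrace", 1, -2)	-> "tra"	(negative length counts back
 *						from the end of the string)
 */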
4049
4050 case DIF_SUBR_GETMAJOR:
4051 regs[rd] = (uintptr_t)major( (dev_t)tupregs[0].dttk_value );
4052 break;
4053
4054 case DIF_SUBR_GETMINOR:
4055 regs[rd] = (uintptr_t)minor( (dev_t)tupregs[0].dttk_value );
4056 break;
4057
4058 case DIF_SUBR_DDI_PATHNAME: {
4059 /* APPLE NOTE: currently unsupported on Darwin */
4060 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4061 regs[rd] = 0;
4062 break;
4063 }
4064
4065 case DIF_SUBR_STRJOIN: {
4066 char *d = (char *)mstate->dtms_scratch_ptr;
4067 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4068 uintptr_t s1 = tupregs[0].dttk_value;
4069 uintptr_t s2 = tupregs[1].dttk_value;
4070 uint64_t i = 0;
4071
4072 if (!dtrace_strcanload(s1, size, mstate, vstate) ||
4073 !dtrace_strcanload(s2, size, mstate, vstate)) {
4074 regs[rd] = 0;
4075 break;
4076 }
4077
4078 if (!DTRACE_INSCRATCH(mstate, size)) {
4079 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4080 regs[rd] = 0;
4081 break;
4082 }
4083
4084 for (;;) {
4085 if (i >= size) {
4086 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4087 regs[rd] = 0;
4088 break;
4089 }
4090
4091 if ((d[i++] = dtrace_load8(s1++)) == '\0') {
4092 i--;
4093 break;
4094 }
4095 }
4096
4097 for (;;) {
4098 if (i >= size) {
4099 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4100 regs[rd] = 0;
4101 break;
4102 }
4103
4104 if ((d[i++] = dtrace_load8(s2++)) == '\0')
4105 break;
4106 }
4107
4108 if (i < size) {
4109 mstate->dtms_scratch_ptr += i;
4110 regs[rd] = (uintptr_t)d;
4111 }
4112
4113 break;
4114 }
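/*
 * Worked example (illustrative): the "i--" in the first copy loop above
 * backs up over the terminating NUL of the first string so the second
 * string is appended in place, e.g.:
 *
 *	strjoin("dt", "race")	-> "dtrace"
 *	strjoin("", "race")	-> "race"	(i backs up to 0 immediately)
 */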
4115
4116 case DIF_SUBR_LLTOSTR: {
4117 int64_t i = (int64_t)tupregs[0].dttk_value;
4118 int64_t val = i < 0 ? i * -1 : i;
4119 uint64_t size = 22; /* enough room for 2^64 in decimal */
4120 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4121
4122 if (!DTRACE_INSCRATCH(mstate, size)) {
4123 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4124 regs[rd] = 0;
4125 break;
4126 }
4127
4128 for (*end-- = '\0'; val; val /= 10)
4129 *end-- = '0' + (val % 10);
4130
4131 if (i == 0)
4132 *end-- = '0';
4133
4134 if (i < 0)
4135 *end-- = '-';
4136
4137 regs[rd] = (uintptr_t)end + 1;
4138 mstate->dtms_scratch_ptr += size;
4139 break;
4140 }
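/*
 * Illustrative sketch (not part of the original implementation): the
 * conversion above writes digits backwards from the end of a fixed
 * 22-byte window (20 decimal digits for 2^64, an optional sign, and the
 * NUL), then returns a pointer to the first character actually written.
 * A user-space analogue, assuming only <stdint.h>:
 *
 *	char buf[22], *end = &buf[21];
 *	int64_t i = -42, val = (i < 0) ? -i : i;
 *
 *	for (*end-- = '\0'; val; val /= 10)
 *		*end-- = '0' + (val % 10);
 *	if (i == 0)
 *		*end-- = '0';
 *	if (i < 0)
 *		*end-- = '-';
 *	// end + 1 now points at "-42"
 */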
4141
4142 case DIF_SUBR_HTONS:
4143 case DIF_SUBR_NTOHS:
4144 #ifdef _BIG_ENDIAN
4145 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4146 #else
4147 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4148 #endif
4149 break;
4150
4151
4152 case DIF_SUBR_HTONL:
4153 case DIF_SUBR_NTOHL:
4154 #ifdef _BIG_ENDIAN
4155 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4156 #else
4157 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4158 #endif
4159 break;
4160
4161
4162 case DIF_SUBR_HTONLL:
4163 case DIF_SUBR_NTOHLL:
4164 #ifdef _BIG_ENDIAN
4165 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4166 #else
4167 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4168 #endif
4169 break;
4170
4171
4172 case DIF_SUBR_DIRNAME:
4173 case DIF_SUBR_BASENAME: {
4174 char *dest = (char *)mstate->dtms_scratch_ptr;
4175 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4176 uintptr_t src = tupregs[0].dttk_value;
4177 int i, j, len = dtrace_strlen((char *)src, size);
4178 int lastbase = -1, firstbase = -1, lastdir = -1;
4179 int start, end;
4180
4181 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4182 regs[rd] = 0;
4183 break;
4184 }
4185
4186 if (!DTRACE_INSCRATCH(mstate, size)) {
4187 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4188 regs[rd] = 0;
4189 break;
4190 }
4191
4192 /*
4193 * The basename and dirname for a zero-length string are
4194 * defined to be "."
4195 */
4196 if (len == 0) {
4197 len = 1;
4198 src = (uintptr_t)".";
4199 }
4200
4201 /*
4202 * Start from the back of the string, moving back toward the
4203 * front until we see a character that isn't a slash. That
4204 * character is the last character in the basename.
4205 */
4206 for (i = len - 1; i >= 0; i--) {
4207 if (dtrace_load8(src + i) != '/')
4208 break;
4209 }
4210
4211 if (i >= 0)
4212 lastbase = i;
4213
4214 /*
4215 * Starting from the last character in the basename, move
4216 * towards the front until we find a slash. The character
4217 * that we processed immediately before that is the first
4218 * character in the basename.
4219 */
4220 for (; i >= 0; i--) {
4221 if (dtrace_load8(src + i) == '/')
4222 break;
4223 }
4224
4225 if (i >= 0)
4226 firstbase = i + 1;
4227
4228 /*
4229 * Now keep going until we find a non-slash character. That
4230 * character is the last character in the dirname.
4231 */
4232 for (; i >= 0; i--) {
4233 if (dtrace_load8(src + i) != '/')
4234 break;
4235 }
4236
4237 if (i >= 0)
4238 lastdir = i;
4239
4240 ASSERT(!(lastbase == -1 && firstbase != -1));
4241 ASSERT(!(firstbase == -1 && lastdir != -1));
4242
4243 if (lastbase == -1) {
4244 /*
4245 * We didn't find a non-slash character. We know that
4246 * the length is non-zero, so the whole string must be
4247 * slashes. In either the dirname or the basename
4248 * case, we return '/'.
4249 */
4250 ASSERT(firstbase == -1);
4251 firstbase = lastbase = lastdir = 0;
4252 }
4253
4254 if (firstbase == -1) {
4255 /*
4256 * The entire string consists only of a basename
4257 * component. If we're looking for dirname, we need
4258 * to change our string to be just "."; if we're
4259 * looking for a basename, we'll just set the first
4260 * character of the basename to be 0.
4261 */
4262 if (subr == DIF_SUBR_DIRNAME) {
4263 ASSERT(lastdir == -1);
4264 src = (uintptr_t)".";
4265 lastdir = 0;
4266 } else {
4267 firstbase = 0;
4268 }
4269 }
4270
4271 if (subr == DIF_SUBR_DIRNAME) {
4272 if (lastdir == -1) {
4273 /*
4274 * We know that we have a slash in the name --
4275 * or lastdir would be set to 0, above. And
4276 * because lastdir is -1, we know that this
4277 * slash must be the first character. (That
4278 * is, the full string must be of the form
4279 * "/basename".) In this case, the last
4280 * character of the directory name is 0.
4281 */
4282 lastdir = 0;
4283 }
4284
4285 start = 0;
4286 end = lastdir;
4287 } else {
4288 ASSERT(subr == DIF_SUBR_BASENAME);
4289 ASSERT(firstbase != -1 && lastbase != -1);
4290 start = firstbase;
4291 end = lastbase;
4292 }
4293
4294 for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
4295 dest[j] = dtrace_load8(src + i);
4296
4297 dest[j] = '\0';
4298 regs[rd] = (uintptr_t)dest;
4299 mstate->dtms_scratch_ptr += size;
4300 break;
4301 }
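/*
 * Worked example (illustrative) of the three backward scans above,
 * using src = "/usr/lib//" (len == 10):
 *
 *	scan 1: skip trailing slashes		-> lastbase  == 7  ('b')
 *	scan 2: back up to the next slash	-> firstbase == 5  ('l')
 *	scan 3: skip that run of slashes	-> lastdir   == 3  ('r')
 *
 *	basename("/usr/lib//")	-> "lib"	(chars firstbase..lastbase)
 *	dirname("/usr/lib//")	-> "/usr"	(chars 0..lastdir)
 */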
4302
4303 case DIF_SUBR_CLEANPATH: {
4304 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4305 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4306 uintptr_t src = tupregs[0].dttk_value;
4307 int i = 0, j = 0;
4308
4309 if (!dtrace_strcanload(src, size, mstate, vstate)) {
4310 regs[rd] = 0;
4311 break;
4312 }
4313
4314 if (!DTRACE_INSCRATCH(mstate, size)) {
4315 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4316 regs[rd] = 0;
4317 break;
4318 }
4319
4320 /*
4321 * Move forward, loading each character.
4322 */
4323 do {
4324 c = dtrace_load8(src + i++);
4325 next:
4326 if ((uint64_t)(j + 5) >= size) /* 5 = room for "/..c" plus NUL */
4327 break;
4328
4329 if (c != '/') {
4330 dest[j++] = c;
4331 continue;
4332 }
4333
4334 c = dtrace_load8(src + i++);
4335
4336 if (c == '/') {
4337 /*
4338 * We have two slashes -- we can just advance
4339 * to the next character.
4340 */
4341 goto next;
4342 }
4343
4344 if (c != '.') {
4345 /*
4346 * This is not "." and it's not ".." -- we can
4347 * just store the "/" and this character and
4348 * drive on.
4349 */
4350 dest[j++] = '/';
4351 dest[j++] = c;
4352 continue;
4353 }
4354
4355 c = dtrace_load8(src + i++);
4356
4357 if (c == '/') {
4358 /*
4359 * This is a "/./" component. We're not going
4360 * to store anything in the destination buffer;
4361 * we're just going to go to the next component.
4362 */
4363 goto next;
4364 }
4365
4366 if (c != '.') {
4367 /*
4368 * This is not ".." -- we can just store the
4369 * "/." and this character and continue
4370 * processing.
4371 */
4372 dest[j++] = '/';
4373 dest[j++] = '.';
4374 dest[j++] = c;
4375 continue;
4376 }
4377
4378 c = dtrace_load8(src + i++);
4379
4380 if (c != '/' && c != '\0') {
4381 /*
4382 * This is not ".." -- it's "..[mumble]".
4383 * We'll store the "/.." and this character
4384 * and continue processing.
4385 */
4386 dest[j++] = '/';
4387 dest[j++] = '.';
4388 dest[j++] = '.';
4389 dest[j++] = c;
4390 continue;
4391 }
4392
4393 /*
4394 * This is "/../" or "/..\0". We need to back up
4395 * our destination pointer until we find a "/".
4396 */
4397 i--;
4398 while (j != 0 && dest[--j] != '/')
4399 continue;
4400
4401 if (c == '\0')
4402 dest[++j] = '/';
4403 } while (c != '\0');
4404
4405 dest[j] = '\0';
4406 regs[rd] = (uintptr_t)dest;
4407 mstate->dtms_scratch_ptr += size;
4408 break;
4409 }
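/*
 * Worked example (illustrative): cleanpath() is purely lexical -- it
 * collapses redundant components without consulting the filesystem or
 * resolving symlinks.  Tracing the loop above on one input:
 *
 *	cleanpath("/foo//bar/./baz/../qux")	-> "/foo/bar/qux"
 *
 * "//" and "/./" emit nothing, and "/../" backs the destination pointer
 * up to the previous '/' before continuing.
 */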
4410
4411 case DIF_SUBR_INET_NTOA:
4412 case DIF_SUBR_INET_NTOA6:
4413 case DIF_SUBR_INET_NTOP: {
4414 size_t size;
4415 int af, argi, i;
4416 char *base, *end;
4417
4418 if (subr == DIF_SUBR_INET_NTOP) {
4419 af = (int)tupregs[0].dttk_value;
4420 argi = 1;
4421 } else {
4422 af = subr == DIF_SUBR_INET_NTOA ? AF_INET : AF_INET6;
4423 argi = 0;
4424 }
4425
4426 if (af == AF_INET) {
4427 #if !defined(__APPLE__)
4428 ipaddr_t ip4;
4429 #else
4430 uint32_t ip4;
4431 #endif /* __APPLE__ */
4432 uint8_t *ptr8, val;
4433
4434 /*
4435 * Safely load the IPv4 address.
4436 */
4437 #if !defined(__APPLE__)
4438 ip4 = dtrace_load32(tupregs[argi].dttk_value);
4439 #else
4440 dtrace_bcopy(
4441 (void *)(uintptr_t)tupregs[argi].dttk_value,
4442 (void *)(uintptr_t)&ip4, sizeof (ip4));
4443 #endif /* __APPLE__ */
4444 /*
4445 * Check an IPv4 string will fit in scratch.
4446 */
4447 #if !defined(__APPLE__)
4448 size = INET_ADDRSTRLEN;
4449 #else
4450 size = MAX_IPv4_STR_LEN;
4451 #endif /* __APPLE__ */
4452 if (!DTRACE_INSCRATCH(mstate, size)) {
4453 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4454 regs[rd] = 0;
4455 break;
4456 }
4457 base = (char *)mstate->dtms_scratch_ptr;
4458 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4459
4460 /*
4461 * Stringify as a dotted decimal quad.
4462 */
4463 *end-- = '\0';
4464 ptr8 = (uint8_t *)&ip4;
4465 for (i = 3; i >= 0; i--) {
4466 val = ptr8[i];
4467
4468 if (val == 0) {
4469 *end-- = '0';
4470 } else {
4471 for (; val; val /= 10) {
4472 *end-- = '0' + (val % 10);
4473 }
4474 }
4475
4476 if (i > 0)
4477 *end-- = '.';
4478 }
4479 ASSERT(end + 1 >= base);
4480
4481 } else if (af == AF_INET6) {
4482 #if defined(__APPLE__)
4483 #define _S6_un __u6_addr
4484 #define _S6_u8 __u6_addr8
4485 #endif /* __APPLE__ */
4486 struct in6_addr ip6;
4487 int firstzero, tryzero, numzero, v6end;
4488 uint16_t val;
4489 const char digits[] = "0123456789abcdef";
4490
4491 /*
4492 * Stringify using RFC 1884 convention 2 -- 16-bit
4493 * hexadecimal values with a zero-run compression.
4494 * Lowercase hexadecimal digits are used,
4495 * e.g., fe80::214:4fff:fe0b:76c8.
4496 * The IPv4 embedded form is returned for inet_ntop,
4497 * just the IPv4 string is returned for inet_ntoa6.
4498 */
4499
4500 /*
4501 * Safely load the IPv6 address.
4502 */
4503 dtrace_bcopy(
4504 (void *)(uintptr_t)tupregs[argi].dttk_value,
4505 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4506
4507 /*
4508 * Check an IPv6 string will fit in scratch.
4509 */
4510 size = INET6_ADDRSTRLEN;
4511 if (!DTRACE_INSCRATCH(mstate, size)) {
4512 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4513 regs[rd] = 0;
4514 break;
4515 }
4516 base = (char *)mstate->dtms_scratch_ptr;
4517 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4518 *end-- = '\0';
4519
4520 /*
4521 * Find the longest run of 16 bit zero values
4522 * for the single allowed zero compression - "::".
4523 */
4524 firstzero = -1;
4525 tryzero = -1;
4526 numzero = 1;
4527 for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
4528 if (ip6._S6_un._S6_u8[i] == 0 &&
4529 tryzero == -1 && i % 2 == 0) {
4530 tryzero = i;
4531 continue;
4532 }
4533
4534 if (tryzero != -1 &&
4535 (ip6._S6_un._S6_u8[i] != 0 ||
4536 i == sizeof (struct in6_addr) - 1)) {
4537
4538 if (i - tryzero <= numzero) {
4539 tryzero = -1;
4540 continue;
4541 }
4542
4543 firstzero = tryzero;
4544 numzero = i - i % 2 - tryzero;
4545 tryzero = -1;
4546
4547 if (ip6._S6_un._S6_u8[i] == 0 &&
4548 i == sizeof (struct in6_addr) - 1)
4549 numzero += 2;
4550 }
4551 }
4552 ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
4553
4554 /*
4555 * Check for an IPv4 embedded address.
4556 */
4557 v6end = sizeof (struct in6_addr) - 2;
4558 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4559 IN6_IS_ADDR_V4COMPAT(&ip6)) {
4560 for (i = sizeof (struct in6_addr) - 1;
4561 i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
4562 ASSERT(end >= base);
4563
4564 val = ip6._S6_un._S6_u8[i];
4565
4566 if (val == 0) {
4567 *end-- = '0';
4568 } else {
4569 for (; val; val /= 10) {
4570 *end-- = '0' + val % 10;
4571 }
4572 }
4573
4574 if (i > (int)DTRACE_V4MAPPED_OFFSET)
4575 *end-- = '.';
4576 }
4577
4578 if (subr == DIF_SUBR_INET_NTOA6)
4579 goto inetout;
4580
4581 /*
4582 * Set v6end to skip the IPv4 address that
4583 * we have already stringified.
4584 */
4585 v6end = 10;
4586 }
4587
4588 /*
4589 * Build the IPv6 string by working through the
4590 * address in reverse.
4591 */
4592 for (i = v6end; i >= 0; i -= 2) {
4593 ASSERT(end >= base);
4594
4595 if (i == firstzero + numzero - 2) {
4596 *end-- = ':';
4597 *end-- = ':';
4598 i -= numzero - 2;
4599 continue;
4600 }
4601
4602 if (i < 14 && i != firstzero - 2)
4603 *end-- = ':';
4604
4605 val = (ip6._S6_un._S6_u8[i] << 8) +
4606 ip6._S6_un._S6_u8[i + 1];
4607
4608 if (val == 0) {
4609 *end-- = '0';
4610 } else {
4611 for (; val; val /= 16) {
4612 *end-- = digits[val % 16];
4613 }
4614 }
4615 }
4616 ASSERT(end + 1 >= base);
4617
4618 #if defined(__APPLE__)
4619 #undef _S6_un
4620 #undef _S6_u8
4621 #endif /* __APPLE__ */
4622 } else {
4623 /*
4624 * The user didn't use AF_INET or AF_INET6.
4625 */
4626 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4627 regs[rd] = 0;
4628 break;
4629 }
4630
4631 inetout: regs[rd] = (uintptr_t)end + 1;
4632 mstate->dtms_scratch_ptr += size;
4633 break;
4634 }
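/*
 * Worked example (illustrative, not part of the implementation): for
 * the loopback address ::1 -- fifteen zero bytes followed by 0x01 --
 * the scan above settles on firstzero == 0 and numzero == 14, so the
 * builder emits "::" for that run followed by the final group, giving
 * "::1".  For a V4-mapped address such as ::ffff:10.1.2.3, inet_ntop()
 * keeps the dotted-quad tail ("::ffff:10.1.2.3") while inet_ntoa6()
 * returns just the embedded IPv4 string ("10.1.2.3"), as the comment
 * above describes.
 */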
4635
4636 case DIF_SUBR_TOUPPER:
4637 case DIF_SUBR_TOLOWER: {
4638 uintptr_t src = tupregs[0].dttk_value;
4639 char *dest = (char *)mstate->dtms_scratch_ptr;
4640 char lower, upper, base, c;
4641 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4642 size_t len = dtrace_strlen((char*) src, size);
4643 size_t i = 0;
4644
4645 lower = (subr == DIF_SUBR_TOUPPER) ? 'a' : 'A';
4646 upper = (subr == DIF_SUBR_TOUPPER) ? 'z' : 'Z';
4647 base = (subr == DIF_SUBR_TOUPPER) ? 'A' : 'a';
4648
4649 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4650 regs[rd] = 0;
4651 break;
4652 }
4653
4654 if (!DTRACE_INSCRATCH(mstate, size)) {
4655 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4656 regs[rd] = 0;
4657 break;
4658 }
4659
4660 for (i = 0; i < size - 1; ++i) {
4661 if ((c = dtrace_load8(src + i)) == '\0')
4662 break;
4663 if (c >= lower && c <= upper)
4664 c = base + (c - lower);
4665 dest[i] = c;
4666 }
4667
4668 ASSERT(i < size);
4669
4670 dest[i] = '\0';
4671 regs[rd] = (uintptr_t) dest;
4672 mstate->dtms_scratch_ptr += size;
4673
4674 break;
4675 }
4676
4677 /*
4678 * APPLE NOTE:
4679 * CoreProfile callback ('core_profile (uint64_t, [uint64_t], [uint64_t] ...)')
4680 */
4681 case DIF_SUBR_COREPROFILE: {
4682 uint64_t selector = tupregs[0].dttk_value;
4683 uint64_t args[DIF_DTR_NREGS-1] = {0ULL};
4684 uint32_t ii;
4685 uint32_t count = (uint32_t)nargs;
4686
4687 if (count < 1) {
4688 regs[rd] = KERN_FAILURE;
4689 break;
4690 }
4691
4692 if (count > DIF_DTR_NREGS)
4693 count = DIF_DTR_NREGS;
4694
4695 /* copy in any variadic argument list, bounded by DIF_DTR_NREGS */
4696 for (ii = 0; ii < count - 1; ii++) {
4697 args[ii] = tupregs[ii+1].dttk_value;
4698 }
4699
4700 kern_return_t ret =
4701 chudxnu_dtrace_callback(selector, args, count-1);
4702 if (KERN_SUCCESS != ret) {
4703 /* error */
4704 }
4705
4706 regs[rd] = ret;
4707 break;
4708 }
4709 }
4710 }
4711
4712 /*
4713 * Emulate the execution of DTrace IR instructions specified by the given
4714 * DIF object. This function is deliberately void of assertions as all of
4715 * the necessary checks are handled by a call to dtrace_difo_validate().
4716 */
4717 static uint64_t
4718 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
4719 dtrace_vstate_t *vstate, dtrace_state_t *state)
4720 {
4721 const dif_instr_t *text = difo->dtdo_buf;
4722 const uint_t textlen = difo->dtdo_len;
4723 const char *strtab = difo->dtdo_strtab;
4724 const uint64_t *inttab = difo->dtdo_inttab;
4725
4726 uint64_t rval = 0;
4727 dtrace_statvar_t *svar;
4728 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
4729 dtrace_difv_t *v;
4730 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4731 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4732
4733 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
4734 uint64_t regs[DIF_DIR_NREGS];
4735 uint64_t *tmp;
4736
4737 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
4738 int64_t cc_r;
4739 uint_t pc = 0, id, opc = 0;
4740 uint8_t ttop = 0;
4741 dif_instr_t instr;
4742 uint_t r1, r2, rd;
4743
4744 /*
4745 * We stash the current DIF object into the machine state: we need it
4746 * for subsequent access checking.
4747 */
4748 mstate->dtms_difo = difo;
4749
4750 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
4751
4752 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
4753 opc = pc;
4754
4755 instr = text[pc++];
4756 r1 = DIF_INSTR_R1(instr);
4757 r2 = DIF_INSTR_R2(instr);
4758 rd = DIF_INSTR_RD(instr);
4759
4760 switch (DIF_INSTR_OP(instr)) {
4761 case DIF_OP_OR:
4762 regs[rd] = regs[r1] | regs[r2];
4763 break;
4764 case DIF_OP_XOR:
4765 regs[rd] = regs[r1] ^ regs[r2];
4766 break;
4767 case DIF_OP_AND:
4768 regs[rd] = regs[r1] & regs[r2];
4769 break;
4770 case DIF_OP_SLL:
4771 regs[rd] = regs[r1] << regs[r2];
4772 break;
4773 case DIF_OP_SRL:
4774 regs[rd] = regs[r1] >> regs[r2];
4775 break;
4776 case DIF_OP_SUB:
4777 regs[rd] = regs[r1] - regs[r2];
4778 break;
4779 case DIF_OP_ADD:
4780 regs[rd] = regs[r1] + regs[r2];
4781 break;
4782 case DIF_OP_MUL:
4783 regs[rd] = regs[r1] * regs[r2];
4784 break;
4785 case DIF_OP_SDIV:
4786 if (regs[r2] == 0) {
4787 regs[rd] = 0;
4788 *flags |= CPU_DTRACE_DIVZERO;
4789 } else {
4790 regs[rd] = (int64_t)regs[r1] /
4791 (int64_t)regs[r2];
4792 }
4793 break;
4794
4795 case DIF_OP_UDIV:
4796 if (regs[r2] == 0) {
4797 regs[rd] = 0;
4798 *flags |= CPU_DTRACE_DIVZERO;
4799 } else {
4800 regs[rd] = regs[r1] / regs[r2];
4801 }
4802 break;
4803
4804 case DIF_OP_SREM:
4805 if (regs[r2] == 0) {
4806 regs[rd] = 0;
4807 *flags |= CPU_DTRACE_DIVZERO;
4808 } else {
4809 regs[rd] = (int64_t)regs[r1] %
4810 (int64_t)regs[r2];
4811 }
4812 break;
4813
4814 case DIF_OP_UREM:
4815 if (regs[r2] == 0) {
4816 regs[rd] = 0;
4817 *flags |= CPU_DTRACE_DIVZERO;
4818 } else {
4819 regs[rd] = regs[r1] % regs[r2];
4820 }
4821 break;
4822
4823 case DIF_OP_NOT:
4824 regs[rd] = ~regs[r1];
4825 break;
4826 case DIF_OP_MOV:
4827 regs[rd] = regs[r1];
4828 break;
4829 case DIF_OP_CMP:
4830 cc_r = regs[r1] - regs[r2];
4831 cc_n = cc_r < 0;
4832 cc_z = cc_r == 0;
4833 cc_v = 0;
4834 cc_c = regs[r1] < regs[r2];
4835 break;
4836 case DIF_OP_TST:
4837 cc_n = cc_v = cc_c = 0;
4838 cc_z = regs[r1] == 0;
4839 break;
4840 case DIF_OP_BA:
4841 pc = DIF_INSTR_LABEL(instr);
4842 break;
4843 case DIF_OP_BE:
4844 if (cc_z)
4845 pc = DIF_INSTR_LABEL(instr);
4846 break;
4847 case DIF_OP_BNE:
4848 if (cc_z == 0)
4849 pc = DIF_INSTR_LABEL(instr);
4850 break;
4851 case DIF_OP_BG:
4852 if ((cc_z | (cc_n ^ cc_v)) == 0)
4853 pc = DIF_INSTR_LABEL(instr);
4854 break;
4855 case DIF_OP_BGU:
4856 if ((cc_c | cc_z) == 0)
4857 pc = DIF_INSTR_LABEL(instr);
4858 break;
4859 case DIF_OP_BGE:
4860 if ((cc_n ^ cc_v) == 0)
4861 pc = DIF_INSTR_LABEL(instr);
4862 break;
4863 case DIF_OP_BGEU:
4864 if (cc_c == 0)
4865 pc = DIF_INSTR_LABEL(instr);
4866 break;
4867 case DIF_OP_BL:
4868 if (cc_n ^ cc_v)
4869 pc = DIF_INSTR_LABEL(instr);
4870 break;
4871 case DIF_OP_BLU:
4872 if (cc_c)
4873 pc = DIF_INSTR_LABEL(instr);
4874 break;
4875 case DIF_OP_BLE:
4876 if (cc_z | (cc_n ^ cc_v))
4877 pc = DIF_INSTR_LABEL(instr);
4878 break;
4879 case DIF_OP_BLEU:
4880 if (cc_c | cc_z)
4881 pc = DIF_INSTR_LABEL(instr);
4882 break;
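/*
 * Worked example (illustrative): how the condition codes set by
 * DIF_OP_CMP above drive the signed vs. unsigned branches.  For
 * regs[r1] == 3 and regs[r2] == 5:
 *
 *	cc_r = -2  ->  cc_n = 1, cc_z = 0, cc_v = 0, cc_c = 1
 *
 *	BL  (signed <)		taken:	cc_n ^ cc_v == 1
 *	BLU (unsigned <)	taken:	cc_c == 1
 *	BGE (signed >=)		not taken
 *	BE  (==)		not taken
 *
 * For regs[r1] == 3 and regs[r2] == (uint64_t)-5 the signed and
 * unsigned outcomes diverge: BG is taken (3 > -5 signed) while BGU is
 * not (3 < 0xfff...fb unsigned).
 */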
4883 case DIF_OP_RLDSB:
4884 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4885 *flags |= CPU_DTRACE_KPRIV;
4886 *illval = regs[r1];
4887 break;
4888 }
4889 /*FALLTHROUGH*/
4890 case DIF_OP_LDSB:
4891 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
4892 break;
4893 case DIF_OP_RLDSH:
4894 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4895 *flags |= CPU_DTRACE_KPRIV;
4896 *illval = regs[r1];
4897 break;
4898 }
4899 /*FALLTHROUGH*/
4900 case DIF_OP_LDSH:
4901 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
4902 break;
4903 case DIF_OP_RLDSW:
4904 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4905 *flags |= CPU_DTRACE_KPRIV;
4906 *illval = regs[r1];
4907 break;
4908 }
4909 /*FALLTHROUGH*/
4910 case DIF_OP_LDSW:
4911 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
4912 break;
4913 case DIF_OP_RLDUB:
4914 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4915 *flags |= CPU_DTRACE_KPRIV;
4916 *illval = regs[r1];
4917 break;
4918 }
4919 /*FALLTHROUGH*/
4920 case DIF_OP_LDUB:
4921 regs[rd] = dtrace_load8(regs[r1]);
4922 break;
4923 case DIF_OP_RLDUH:
4924 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4925 *flags |= CPU_DTRACE_KPRIV;
4926 *illval = regs[r1];
4927 break;
4928 }
4929 /*FALLTHROUGH*/
4930 case DIF_OP_LDUH:
4931 regs[rd] = dtrace_load16(regs[r1]);
4932 break;
4933 case DIF_OP_RLDUW:
4934 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4935 *flags |= CPU_DTRACE_KPRIV;
4936 *illval = regs[r1];
4937 break;
4938 }
4939 /*FALLTHROUGH*/
4940 case DIF_OP_LDUW:
4941 regs[rd] = dtrace_load32(regs[r1]);
4942 break;
4943 case DIF_OP_RLDX:
4944 if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
4945 *flags |= CPU_DTRACE_KPRIV;
4946 *illval = regs[r1];
4947 break;
4948 }
4949 /*FALLTHROUGH*/
4950 case DIF_OP_LDX:
4951 regs[rd] = dtrace_load64(regs[r1]);
4952 break;
4953 /*
4954 * Darwin 32-bit kernel may fetch from 64-bit user.
4955 * Do not cast regs to uintptr_t
4956 * DIF_OP_ULDSB,DIF_OP_ULDSH, DIF_OP_ULDSW, DIF_OP_ULDUB
4957 * DIF_OP_ULDUH, DIF_OP_ULDUW, DIF_OP_ULDX
4958 */
4959 case DIF_OP_ULDSB:
4960 regs[rd] = (int8_t)
4961 dtrace_fuword8(regs[r1]);
4962 break;
4963 case DIF_OP_ULDSH:
4964 regs[rd] = (int16_t)
4965 dtrace_fuword16(regs[r1]);
4966 break;
4967 case DIF_OP_ULDSW:
4968 regs[rd] = (int32_t)
4969 dtrace_fuword32(regs[r1]);
4970 break;
4971 case DIF_OP_ULDUB:
4972 regs[rd] =
4973 dtrace_fuword8(regs[r1]);
4974 break;
4975 case DIF_OP_ULDUH:
4976 regs[rd] =
4977 dtrace_fuword16(regs[r1]);
4978 break;
4979 case DIF_OP_ULDUW:
4980 regs[rd] =
4981 dtrace_fuword32(regs[r1]);
4982 break;
4983 case DIF_OP_ULDX:
4984 regs[rd] =
4985 dtrace_fuword64(regs[r1]);
4986 break;
4987 case DIF_OP_RET:
4988 rval = regs[rd];
4989 pc = textlen;
4990 break;
4991 case DIF_OP_NOP:
4992 break;
4993 case DIF_OP_SETX:
4994 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
4995 break;
4996 case DIF_OP_SETS:
4997 regs[rd] = (uint64_t)(uintptr_t)
4998 (strtab + DIF_INSTR_STRING(instr));
4999 break;
5000 case DIF_OP_SCMP: {
5001 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5002 uintptr_t s1 = regs[r1];
5003 uintptr_t s2 = regs[r2];
5004
5005 if (s1 != 0 &&
5006 !dtrace_strcanload(s1, sz, mstate, vstate))
5007 break;
5008 if (s2 != 0 &&
5009 !dtrace_strcanload(s2, sz, mstate, vstate))
5010 break;
5011
5012 cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
5013
5014 cc_n = cc_r < 0;
5015 cc_z = cc_r == 0;
5016 cc_v = cc_c = 0;
5017 break;
5018 }
5019 case DIF_OP_LDGA:
5020 regs[rd] = dtrace_dif_variable(mstate, state,
5021 r1, regs[r2]);
5022 break;
5023 case DIF_OP_LDGS:
5024 id = DIF_INSTR_VAR(instr);
5025
5026 if (id >= DIF_VAR_OTHER_UBASE) {
5027 uintptr_t a;
5028
5029 id -= DIF_VAR_OTHER_UBASE;
5030 svar = vstate->dtvs_globals[id];
5031 ASSERT(svar != NULL);
5032 v = &svar->dtsv_var;
5033
5034 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5035 regs[rd] = svar->dtsv_data;
5036 break;
5037 }
5038
5039 a = (uintptr_t)svar->dtsv_data;
5040
5041 if (*(uint8_t *)a == UINT8_MAX) {
5042 /*
5043 * If the 0th byte is set to UINT8_MAX
5044 * then this is to be treated as a
5045 * reference to a NULL variable.
5046 */
5047 regs[rd] = 0;
5048 } else {
5049 regs[rd] = a + sizeof (uint64_t);
5050 }
5051
5052 break;
5053 }
5054
5055 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5056 break;
5057
5058 case DIF_OP_STGS:
5059 id = DIF_INSTR_VAR(instr);
5060
5061 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5062 id -= DIF_VAR_OTHER_UBASE;
5063
5064 svar = vstate->dtvs_globals[id];
5065 ASSERT(svar != NULL);
5066 v = &svar->dtsv_var;
5067
5068 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5069 uintptr_t a = (uintptr_t)svar->dtsv_data;
5070
5071 ASSERT(a != 0);
5072 ASSERT(svar->dtsv_size != 0);
5073
5074 if (regs[rd] == 0) {
5075 *(uint8_t *)a = UINT8_MAX;
5076 break;
5077 } else {
5078 *(uint8_t *)a = 0;
5079 a += sizeof (uint64_t);
5080 }
5081 if (!dtrace_vcanload(
5082 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5083 mstate, vstate))
5084 break;
5085
5086 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5087 (void *)a, &v->dtdv_type);
5088 break;
5089 }
5090
5091 svar->dtsv_data = regs[rd];
5092 break;
5093
5094 case DIF_OP_LDTA:
5095 /*
5096 * There are no DTrace built-in thread-local arrays at
5097 * present. This opcode is saved for future work.
5098 */
5099 *flags |= CPU_DTRACE_ILLOP;
5100 regs[rd] = 0;
5101 break;
5102
5103 case DIF_OP_LDLS:
5104 id = DIF_INSTR_VAR(instr);
5105
5106 if (id < DIF_VAR_OTHER_UBASE) {
5107 /*
5108 * For now, this has no meaning.
5109 */
5110 regs[rd] = 0;
5111 break;
5112 }
5113
5114 id -= DIF_VAR_OTHER_UBASE;
5115
5116 ASSERT(id < (uint_t)vstate->dtvs_nlocals);
5117 ASSERT(vstate->dtvs_locals != NULL);
5118 svar = vstate->dtvs_locals[id];
5119 ASSERT(svar != NULL);
5120 v = &svar->dtsv_var;
5121
5122 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5123 uintptr_t a = (uintptr_t)svar->dtsv_data;
5124 size_t sz = v->dtdv_type.dtdt_size;
5125
5126 sz += sizeof (uint64_t);
5127 ASSERT(svar->dtsv_size == (int)NCPU * sz);
5128 a += CPU->cpu_id * sz;
5129
5130 if (*(uint8_t *)a == UINT8_MAX) {
5131 /*
5132 * If the 0th byte is set to UINT8_MAX
5133 * then this is to be treated as a
5134 * reference to a NULL variable.
5135 */
5136 regs[rd] = 0;
5137 } else {
5138 regs[rd] = a + sizeof (uint64_t);
5139 }
5140
5141 break;
5142 }
5143
5144 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5145 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5146 regs[rd] = tmp[CPU->cpu_id];
5147 break;
5148
5149 case DIF_OP_STLS:
5150 id = DIF_INSTR_VAR(instr);
5151
5152 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5153 id -= DIF_VAR_OTHER_UBASE;
5154 ASSERT(id < (uint_t)vstate->dtvs_nlocals);
5155 ASSERT(vstate->dtvs_locals != NULL);
5156 svar = vstate->dtvs_locals[id];
5157 ASSERT(svar != NULL);
5158 v = &svar->dtsv_var;
5159
5160 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5161 uintptr_t a = (uintptr_t)svar->dtsv_data;
5162 size_t sz = v->dtdv_type.dtdt_size;
5163
5164 sz += sizeof (uint64_t);
5165 ASSERT(svar->dtsv_size == (int)NCPU * sz);
5166 a += CPU->cpu_id * sz;
5167
5168 if (regs[rd] == 0) {
5169 *(uint8_t *)a = UINT8_MAX;
5170 break;
5171 } else {
5172 *(uint8_t *)a = 0;
5173 a += sizeof (uint64_t);
5174 }
5175
5176 if (!dtrace_vcanload(
5177 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5178 mstate, vstate))
5179 break;
5180
5181 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5182 (void *)a, &v->dtdv_type);
5183 break;
5184 }
5185
5186 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5187 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5188 tmp[CPU->cpu_id] = regs[rd];
5189 break;
5190
5191 case DIF_OP_LDTS: {
5192 dtrace_dynvar_t *dvar;
5193 dtrace_key_t *key;
5194
5195 id = DIF_INSTR_VAR(instr);
5196 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5197 id -= DIF_VAR_OTHER_UBASE;
5198 v = &vstate->dtvs_tlocals[id];
5199
5200 key = &tupregs[DIF_DTR_NREGS];
5201 key[0].dttk_value = (uint64_t)id;
5202 key[0].dttk_size = 0;
5203 DTRACE_TLS_THRKEY(key[1].dttk_value);
5204 key[1].dttk_size = 0;
5205
5206 dvar = dtrace_dynvar(dstate, 2, key,
5207 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5208 mstate, vstate);
5209
5210 if (dvar == NULL) {
5211 regs[rd] = 0;
5212 break;
5213 }
5214
5215 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5216 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5217 } else {
5218 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5219 }
5220
5221 break;
5222 }
5223
5224 case DIF_OP_STTS: {
5225 dtrace_dynvar_t *dvar;
5226 dtrace_key_t *key;
5227
5228 id = DIF_INSTR_VAR(instr);
5229 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5230 id -= DIF_VAR_OTHER_UBASE;
5231
5232 key = &tupregs[DIF_DTR_NREGS];
5233 key[0].dttk_value = (uint64_t)id;
5234 key[0].dttk_size = 0;
5235 DTRACE_TLS_THRKEY(key[1].dttk_value);
5236 key[1].dttk_size = 0;
5237 v = &vstate->dtvs_tlocals[id];
5238
5239 dvar = dtrace_dynvar(dstate, 2, key,
5240 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5241 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5242 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5243 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5244
5245 /*
5246 * Given that we're storing to thread-local data,
5247 * we need to flush our predicate cache.
5248 */
5249 dtrace_set_thread_predcache(current_thread(), 0);
5250
5251 if (dvar == NULL)
5252 break;
5253
5254 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5255 if (!dtrace_vcanload(
5256 (void *)(uintptr_t)regs[rd],
5257 &v->dtdv_type, mstate, vstate))
5258 break;
5259
5260 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5261 dvar->dtdv_data, &v->dtdv_type);
5262 } else {
5263 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5264 }
5265
5266 break;
5267 }
5268
5269 case DIF_OP_SRA:
5270 regs[rd] = (int64_t)regs[r1] >> regs[r2];
5271 break;
5272
5273 case DIF_OP_CALL:
5274 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5275 regs, tupregs, ttop, mstate, state);
5276 break;
5277
5278 case DIF_OP_PUSHTR:
5279 if (ttop == DIF_DTR_NREGS) {
5280 *flags |= CPU_DTRACE_TUPOFLOW;
5281 break;
5282 }
5283
5284 if (r1 == DIF_TYPE_STRING) {
5285 /*
5286 * If this is a string type and the size is 0,
5287 * we'll use the system-wide default string
5288 * size. Note that we are _not_ looking at
5289 * the value of the DTRACEOPT_STRSIZE option;
5290 * had this been set, we would expect to have
5291 * a non-zero size value in the "pushtr".
5292 */
5293 tupregs[ttop].dttk_size =
5294 dtrace_strlen((char *)(uintptr_t)regs[rd],
5295 regs[r2] ? regs[r2] :
5296 dtrace_strsize_default) + 1;
5297 } else {
5298 tupregs[ttop].dttk_size = regs[r2];
5299 }
5300
5301 tupregs[ttop++].dttk_value = regs[rd];
5302 break;
5303
5304 case DIF_OP_PUSHTV:
5305 if (ttop == DIF_DTR_NREGS) {
5306 *flags |= CPU_DTRACE_TUPOFLOW;
5307 break;
5308 }
5309
5310 tupregs[ttop].dttk_value = regs[rd];
5311 tupregs[ttop++].dttk_size = 0;
5312 break;
5313
5314 case DIF_OP_POPTS:
5315 if (ttop != 0)
5316 ttop--;
5317 break;
5318
5319 case DIF_OP_FLUSHTS:
5320 ttop = 0;
5321 break;
5322
5323 case DIF_OP_LDGAA:
5324 case DIF_OP_LDTAA: {
5325 dtrace_dynvar_t *dvar;
5326 dtrace_key_t *key = tupregs;
5327 uint_t nkeys = ttop;
5328
5329 id = DIF_INSTR_VAR(instr);
5330 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5331 id -= DIF_VAR_OTHER_UBASE;
5332
5333 key[nkeys].dttk_value = (uint64_t)id;
5334 key[nkeys++].dttk_size = 0;
5335
5336 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5337 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5338 key[nkeys++].dttk_size = 0;
5339 v = &vstate->dtvs_tlocals[id];
5340 } else {
5341 v = &vstate->dtvs_globals[id]->dtsv_var;
5342 }
5343
5344 dvar = dtrace_dynvar(dstate, nkeys, key,
5345 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5346 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5347 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5348
5349 if (dvar == NULL) {
5350 regs[rd] = 0;
5351 break;
5352 }
5353
5354 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5355 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5356 } else {
5357 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5358 }
5359
5360 break;
5361 }
5362
5363 case DIF_OP_STGAA:
5364 case DIF_OP_STTAA: {
5365 dtrace_dynvar_t *dvar;
5366 dtrace_key_t *key = tupregs;
5367 uint_t nkeys = ttop;
5368
5369 id = DIF_INSTR_VAR(instr);
5370 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5371 id -= DIF_VAR_OTHER_UBASE;
5372
5373 key[nkeys].dttk_value = (uint64_t)id;
5374 key[nkeys++].dttk_size = 0;
5375
5376 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5377 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5378 key[nkeys++].dttk_size = 0;
5379 v = &vstate->dtvs_tlocals[id];
5380 } else {
5381 v = &vstate->dtvs_globals[id]->dtsv_var;
5382 }
5383
5384 dvar = dtrace_dynvar(dstate, nkeys, key,
5385 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5386 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5387 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5388 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5389
5390 if (dvar == NULL)
5391 break;
5392
5393 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5394 if (!dtrace_vcanload(
5395 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5396 mstate, vstate))
5397 break;
5398
5399 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5400 dvar->dtdv_data, &v->dtdv_type);
5401 } else {
5402 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5403 }
5404
5405 break;
5406 }
5407
5408 case DIF_OP_ALLOCS: {
5409 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5410 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5411
5412 /*
5413 * Rounding up the user allocation size could have
5414 * overflowed large, bogus allocations (like -1ULL) to
5415 * 0.
5416 */
5417 if (size < regs[r1] ||
5418 !DTRACE_INSCRATCH(mstate, size)) {
5419 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5420 regs[rd] = 0;
5421 break;
5422 }
5423
5424 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5425 mstate->dtms_scratch_ptr += size;
5426 regs[rd] = ptr;
5427 break;
5428 }
5429
5430 case DIF_OP_COPYS:
5431 if (!dtrace_canstore(regs[rd], regs[r2],
5432 mstate, vstate)) {
5433 *flags |= CPU_DTRACE_BADADDR;
5434 *illval = regs[rd];
5435 break;
5436 }
5437
5438 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5439 break;
5440
5441 dtrace_bcopy((void *)(uintptr_t)regs[r1],
5442 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5443 break;
5444
5445 case DIF_OP_STB:
5446 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5447 *flags |= CPU_DTRACE_BADADDR;
5448 *illval = regs[rd];
5449 break;
5450 }
5451 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5452 break;
5453
5454 case DIF_OP_STH:
5455 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5456 *flags |= CPU_DTRACE_BADADDR;
5457 *illval = regs[rd];
5458 break;
5459 }
5460 if (regs[rd] & 1) {
5461 *flags |= CPU_DTRACE_BADALIGN;
5462 *illval = regs[rd];
5463 break;
5464 }
5465 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5466 break;
5467
5468 case DIF_OP_STW:
5469 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5470 *flags |= CPU_DTRACE_BADADDR;
5471 *illval = regs[rd];
5472 break;
5473 }
5474 if (regs[rd] & 3) {
5475 *flags |= CPU_DTRACE_BADALIGN;
5476 *illval = regs[rd];
5477 break;
5478 }
5479 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5480 break;
5481
5482 case DIF_OP_STX:
5483 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5484 *flags |= CPU_DTRACE_BADADDR;
5485 *illval = regs[rd];
5486 break;
5487 }
5488
5489 /*
5490 * Darwin kmem_zalloc() called from
5491 * dtrace_difo_init() is 4-byte aligned.
5492 */
5493 if (regs[rd] & 3) {
5494 *flags |= CPU_DTRACE_BADALIGN;
5495 *illval = regs[rd];
5496 break;
5497 }
5498 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5499 break;
5500 }
5501 }
5502
5503 if (!(*flags & CPU_DTRACE_FAULT))
5504 return (rval);
5505
5506 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5507 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5508
5509 return (0);
5510 }
5511
5512 static void
5513 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5514 {
5515 dtrace_probe_t *probe = ecb->dte_probe;
5516 dtrace_provider_t *prov = probe->dtpr_provider;
5517 char c[DTRACE_FULLNAMELEN + 80], *str;
5518 const char *msg = "dtrace: breakpoint action at probe ";
5519 const char *ecbmsg = " (ecb ";
5520 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5521 uintptr_t val = (uintptr_t)ecb;
5522 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5523
5524 if (dtrace_destructive_disallow)
5525 return;
5526
5527 /*
5528 * It's impossible to be taking action on the NULL probe.
5529 */
5530 ASSERT(probe != NULL);
5531
5532 /*
5533 * This is a poor man's (destitute man's?) sprintf(): we want to
5534 * print the provider name, module name, function name and name of
5535 * the probe, along with the hex address of the ECB with the breakpoint
5536 * action -- all of which we must place in the character buffer by
5537 * hand.
5538 */
5539 while (*msg != '\0')
5540 c[i++] = *msg++;
5541
5542 for (str = prov->dtpv_name; *str != '\0'; str++)
5543 c[i++] = *str;
5544 c[i++] = ':';
5545
5546 for (str = probe->dtpr_mod; *str != '\0'; str++)
5547 c[i++] = *str;
5548 c[i++] = ':';
5549
5550 for (str = probe->dtpr_func; *str != '\0'; str++)
5551 c[i++] = *str;
5552 c[i++] = ':';
5553
5554 for (str = probe->dtpr_name; *str != '\0'; str++)
5555 c[i++] = *str;
5556
5557 while (*ecbmsg != '\0')
5558 c[i++] = *ecbmsg++;
5559
5560 while (shift >= 0) {
5561 mask = (uintptr_t)0xf << shift;
5562
5563 if (val >= ((uintptr_t)1 << shift))
5564 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5565 shift -= 4;
5566 }
5567
5568 c[i++] = ')';
5569 c[i] = '\0';
5570
5571 debug_enter(c);
5572 }
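/*
 * Illustrative sketch (not part of the original implementation): the
 * nibble loop above prints the ECB address in hex without leading
 * zeroes, because a digit is emitted only once val >= (1 << shift).
 * A user-space analogue:
 *
 *	uintptr_t val = 0x2a5c;
 *	int shift = (sizeof (uintptr_t) * 8) - 4;
 *	char buf[2 * sizeof (uintptr_t) + 1], *p = buf;
 *
 *	for (; shift >= 0; shift -= 4) {
 *		uintptr_t mask = (uintptr_t)0xf << shift;
 *		if (val >= ((uintptr_t)1 << shift))
 *			*p++ = "0123456789abcdef"[(val & mask) >> shift];
 *	}
 *	*p = '\0';	// buf now holds "2a5c"
 */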
5573
5574 static void
5575 dtrace_action_panic(dtrace_ecb_t *ecb)
5576 {
5577 dtrace_probe_t *probe = ecb->dte_probe;
5578
5579 /*
5580 * It's impossible to be taking action on the NULL probe.
5581 */
5582 ASSERT(probe != NULL);
5583
5584 if (dtrace_destructive_disallow)
5585 return;
5586
5587 if (dtrace_panicked != NULL)
5588 return;
5589
5590 if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
5591 return;
5592
5593 /*
5594 * We won the right to panic. (We want to be sure that only one
5595 * thread calls panic() from dtrace_probe(), and that panic() is
5596 * called exactly once.)
5597 */
5598 panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5599 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5600 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5601
5602 /*
5603 * APPLE NOTE: this was for an old Mac OS X debug feature
5604 * allowing a return from panic(). Revisit someday.
5605 */
5606 dtrace_panicked = NULL;
5607 }
5608
5609 static void
5610 dtrace_action_raise(uint64_t sig)
5611 {
5612 if (dtrace_destructive_disallow)
5613 return;
5614
5615 if (sig >= NSIG) {
5616 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5617 return;
5618 }
5619
5620 /*
5621 * raise() has a queue depth of 1 -- we ignore all subsequent
5622 * invocations of the raise() action.
5623 */
5624
5625 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
5626
5627 if (uthread && uthread->t_dtrace_sig == 0) {
5628 uthread->t_dtrace_sig = sig;
5629 act_set_astbsd(current_thread());
5630 }
5631 }
5632
5633 static void
5634 dtrace_action_stop(void)
5635 {
5636 if (dtrace_destructive_disallow)
5637 return;
5638
5639 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
5640 if (uthread) {
5641 /*
5642 * The currently running process will be task_suspend()ed
5643 * when it next leaves the kernel.
5644 */
5645 uthread->t_dtrace_stop = 1;
5646 act_set_astbsd(current_thread());
5647 }
5648 }
5649
5650
5651 /*
5652 * APPLE NOTE: pidresume works in conjunction with the dtrace stop action.
5653 * Both activate only when the currently running process next leaves the
5654 * kernel.
5655 */
5656 static void
5657 dtrace_action_pidresume(uint64_t pid)
5658 {
5659 if (dtrace_destructive_disallow)
5660 return;
5661
5662 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5663 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5664 return;
5665 }
5666 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
5667
5668 /*
5669 * When the currently running process leaves the kernel, it attempts to
5670 * task_resume the process (denoted by pid), if that pid appears to have
5671 * been stopped by dtrace_action_stop().
5672 * The currently running process has a pidresume() queue depth of 1 --
5673 * subsequent invocations of the pidresume() action are ignored.
5674 */
5675
5676 if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) {
5677 uthread->t_dtrace_resumepid = pid;
5678 act_set_astbsd(current_thread());
5679 }
5680 }
5681
5682 static void
5683 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
5684 {
5685 hrtime_t now;
5686 volatile uint16_t *flags;
5687 dtrace_cpu_t *cpu = CPU;
5688
5689 if (dtrace_destructive_disallow)
5690 return;
5691
5692 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
5693
5694 now = dtrace_gethrtime();
5695
5696 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
5697 /*
5698 * We need to advance the mark to the current time.
5699 */
5700 cpu->cpu_dtrace_chillmark = now;
5701 cpu->cpu_dtrace_chilled = 0;
5702 }
5703
5704 /*
5705 * Now check to see if the requested chill time would take us over
5706 * the maximum amount of time allowed in the chill interval. (Or
5707 * worse, if the calculation itself induces overflow.)
5708 */
5709 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
5710 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
5711 *flags |= CPU_DTRACE_ILLOP;
5712 return;
5713 }
5714
5715 while (dtrace_gethrtime() - now < val)
5716 continue;
5717
5718 /*
5719 * Normally, we assure that the value of the variable "timestamp" does
5720 * not change within an ECB. The presence of chill() represents an
5721 * exception to this rule, however.
5722 */
5723 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
5724 cpu->cpu_dtrace_chilled += val;
5725 }
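/*
 * Illustrative note (values hypothetical, for exposition only): the
 * accounting above enforces a per-CPU busy-wait budget per interval.
 * Suppose dtrace_chill_interval were one second and dtrace_chill_max
 * 500 microseconds: once cpu_dtrace_chilled accumulates 500us worth of
 * chill() calls within one interval, further chill() requests set
 * CPU_DTRACE_ILLOP and return without spinning; when the interval
 * elapses, cpu_dtrace_chillmark is advanced and the budget resets.
 */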
5726
5727 static void
5728 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
5729 uint64_t *buf, uint64_t arg)
5730 {
5731 int nframes = DTRACE_USTACK_NFRAMES(arg);
5732 int strsize = DTRACE_USTACK_STRSIZE(arg);
5733 uint64_t *pcs = &buf[1], *fps;
5734 char *str = (char *)&pcs[nframes];
5735 int size, offs = 0, i, j;
5736 uintptr_t old = mstate->dtms_scratch_ptr, saved;
5737 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5738 char *sym;
5739
5740 /*
5741 * Should be taking a faster path if string space has not been
5742 * allocated.
5743 */
5744 ASSERT(strsize != 0);
5745
5746 /*
5747 * We will first allocate some temporary space for the frame pointers.
5748 */
5749 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5750 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
5751 (nframes * sizeof (uint64_t));
5752
5753 if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
5754 /*
5755 * Not enough room for our frame pointers -- need to indicate
5756 * that we ran out of scratch space.
5757 */
5758 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5759 return;
5760 }
5761
5762 mstate->dtms_scratch_ptr += size;
5763 saved = mstate->dtms_scratch_ptr;
5764
5765 /*
5766 * Now get a stack with both program counters and frame pointers.
5767 */
5768 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5769 dtrace_getufpstack(buf, fps, nframes + 1);
5770 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5771
5772 /*
5773 * If that faulted, we're cooked.
5774 */
5775 if (*flags & CPU_DTRACE_FAULT)
5776 goto out;
5777
5778 /*
5779 * Now we want to walk up the stack, calling the USTACK helper. For
5780 * each iteration, we restore the scratch pointer.
5781 */
5782 for (i = 0; i < nframes; i++) {
5783 mstate->dtms_scratch_ptr = saved;
5784
5785 if (offs >= strsize)
5786 break;
5787
5788 sym = (char *)(uintptr_t)dtrace_helper(
5789 DTRACE_HELPER_ACTION_USTACK,
5790 mstate, state, pcs[i], fps[i]);
5791
5792 /*
5793 * If we faulted while running the helper, we're going to
5794 * clear the fault and null out the corresponding string.
5795 */
5796 if (*flags & CPU_DTRACE_FAULT) {
5797 *flags &= ~CPU_DTRACE_FAULT;
5798 str[offs++] = '\0';
5799 continue;
5800 }
5801
5802 if (sym == NULL) {
5803 str[offs++] = '\0';
5804 continue;
5805 }
5806
5807 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5808
5809 /*
5810 * Now copy in the string that the helper returned to us.
5811 */
5812 for (j = 0; offs + j < strsize; j++) {
5813 if ((str[offs + j] = sym[j]) == '\0')
5814 break;
5815 }
5816
5817 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5818
5819 offs += j + 1;
5820 }
5821
5822 if (offs >= strsize) {
5823 /*
5824 * If we didn't have room for all of the strings, we don't
5825 * abort processing -- this needn't be a fatal error -- but we
5826 * still want to increment a counter (dts_stkstroverflows) to
5827 * allow this condition to be warned about. (If this is from
5828 * a jstack() action, it is easily tuned via jstackstrsize.)
5829 */
5830 dtrace_error(&state->dts_stkstroverflows);
5831 }
5832
5833 while (offs < strsize)
5834 str[offs++] = '\0';
5835
5836 out:
5837 mstate->dtms_scratch_ptr = old;
5838 }
5839
5840 /*
5841 * If you're looking for the epicenter of DTrace, you just found it. This
5842 * is the function called by the provider to fire a probe -- from which all
5843 * subsequent probe-context DTrace activity emanates.
5844 */
5845 static void
5846 __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
5847 uint64_t arg2, uint64_t arg3, uint64_t arg4)
5848 {
5849 processorid_t cpuid;
5850 dtrace_icookie_t cookie;
5851 dtrace_probe_t *probe;
5852 dtrace_mstate_t mstate;
5853 dtrace_ecb_t *ecb;
5854 dtrace_action_t *act;
5855 intptr_t offs;
5856 size_t size;
5857 int vtime, onintr;
5858 volatile uint16_t *flags;
5859 hrtime_t now;
5860
5861 cookie = dtrace_interrupt_disable();
5862 probe = dtrace_probes[id - 1];
5863 cpuid = CPU->cpu_id;
5864 onintr = CPU_ON_INTR(CPU);
5865
5866 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
5867 probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
5868 /*
5869 * We have hit in the predicate cache; we know that
5870 * this predicate would evaluate to be false.
5871 */
5872 dtrace_interrupt_enable(cookie);
5873 return;
5874 }
5875
5876 if (panic_quiesce) {
5877 /*
5878 * We don't trace anything if we're panicking.
5879 */
5880 dtrace_interrupt_enable(cookie);
5881 return;
5882 }
5883
5884 #if !defined(__APPLE__)
5885 now = dtrace_gethrtime();
5886 vtime = dtrace_vtime_references != 0;
5887
5888 if (vtime && curthread->t_dtrace_start)
5889 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
5890 #else
5891 /*
5892 * APPLE NOTE: The time spent entering DTrace and arriving
5893 * at this point is attributed to the current thread.
5894 * Instead, it should accrue to DTrace. FIXME
5895 */
5896 vtime = dtrace_vtime_references != 0;
5897
5898 if (vtime)
5899 {
5900 int64_t dtrace_accum_time, recent_vtime;
5901 thread_t thread = current_thread();
5902
5903 dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
5904
5905 if (dtrace_accum_time >= 0) {
5906 recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
5907
5908 recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
5909
5910 dtrace_set_thread_vtime(thread, recent_vtime);
5911 }
5912 }
5913
5914 now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
5915 #endif /* __APPLE__ */
5916
5917 /*
5918 * APPLE NOTE: A provider may call dtrace_probe_error() in lieu of
5919 * dtrace_probe() in some circumstances. See, e.g., fasttrap_isa.c.
5920 * However, the provider has no access to ECB context, so it passes
5921 * 0 through "arg0" and the probe_id of the overridden probe as arg1.
5922 * Detect that here and cons up a viable state (from the probe_id).
5923 */
5924 if (dtrace_probeid_error == id && 0 == arg0) {
5925 dtrace_id_t ftp_id = (dtrace_id_t)arg1;
5926 dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
5927 dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
5928
5929 if (NULL != ftp_ecb) {
5930 dtrace_state_t *ftp_state = ftp_ecb->dte_state;
5931
5932 arg0 = (uint64_t)(uintptr_t)ftp_state;
5933 arg1 = ftp_ecb->dte_epid;
5934 /*
5935 * args[2-4] established by caller.
5936 */
5937 ftp_state->dts_arg_error_illval = -1; /* arg5 */
5938 }
5939 }
5940
5941 mstate.dtms_difo = NULL;
5942 mstate.dtms_probe = probe;
5943 mstate.dtms_strtok = 0;
5944 mstate.dtms_arg[0] = arg0;
5945 mstate.dtms_arg[1] = arg1;
5946 mstate.dtms_arg[2] = arg2;
5947 mstate.dtms_arg[3] = arg3;
5948 mstate.dtms_arg[4] = arg4;
5949
5950 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
5951
5952 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
5953 dtrace_predicate_t *pred = ecb->dte_predicate;
5954 dtrace_state_t *state = ecb->dte_state;
5955 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
5956 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
5957 dtrace_vstate_t *vstate = &state->dts_vstate;
5958 dtrace_provider_t *prov = probe->dtpr_provider;
5959 uint64_t tracememsize = 0;
5960 int committed = 0;
5961 caddr_t tomax;
5962
5963 /*
5964 * A little subtlety with the following (seemingly innocuous)
5965 * declaration of the automatic 'val': by looking at the
5966 * code, you might think that it could be declared in the
5967 * action processing loop, below. (That is, it's only used in
5968 * the action processing loop.) However, it must be declared
5969 * out of that scope because in the case of DIF expression
5970 * arguments to aggregating actions, one iteration of the
5971 * action loop will use the last iteration's value.
5972 */
5973 #ifdef lint
5974 uint64_t val = 0;
5975 #else
5976 uint64_t val = 0;
5977 #endif
5978
5979 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
5980 *flags &= ~CPU_DTRACE_ERROR;
5981
5982 if (prov == dtrace_provider) {
5983 /*
5984 * If dtrace itself is the provider of this probe,
5985 * we're only going to continue processing the ECB if
5986 * arg0 (the dtrace_state_t) is equal to the ECB's
5987 * creating state. (This prevents disjoint consumers
5988 * from seeing one another's metaprobes.)
5989 */
5990 if (arg0 != (uint64_t)(uintptr_t)state)
5991 continue;
5992 }
5993
5994 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
5995 /*
5996 * We're not currently active. If our provider isn't
5997 * the dtrace pseudo provider, we're not interested.
5998 */
5999 if (prov != dtrace_provider)
6000 continue;
6001
6002 /*
6003 * Now we must further check if we are in the BEGIN
6004 * probe. If we are, we will only continue processing
6005 * if we're still in WARMUP -- if one BEGIN enabling
6006 * has invoked the exit() action, we don't want to
6007 * evaluate subsequent BEGIN enablings.
6008 */
6009 if (probe->dtpr_id == dtrace_probeid_begin &&
6010 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
6011 ASSERT(state->dts_activity ==
6012 DTRACE_ACTIVITY_DRAINING);
6013 continue;
6014 }
6015 }
6016
6017 if (ecb->dte_cond) {
6018 /*
6019 * If the dte_cond bits indicate that this
6020 * consumer is only allowed to see user-mode firings
6021 * of this probe, call the provider's dtps_usermode()
6022 * entry point to check that the probe was fired
6023 * while in a user context. Skip this ECB if that's
6024 * not the case.
6025 */
6026 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
6027 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
6028 probe->dtpr_id, probe->dtpr_arg) == 0)
6029 continue;
6030
6031 /*
6032 * This is more subtle than it looks. We have to be
6033 * absolutely certain that CRED() isn't going to
6034 * change out from under us, so it's only legit to
6035 * examine that structure if we're in constrained
6036 * situations. Currently, the only time we'll perform this
6037 * check is if a non-super-user has enabled the
6038 * profile or syscall providers -- providers that
6039 * allow visibility of all processes. For the
6040 * profile case, the check above will ensure that
6041 * we're examining a user context.
6042 */
6043 if (ecb->dte_cond & DTRACE_COND_OWNER) {
6044 cred_t *cr;
6045 cred_t *s_cr =
6046 ecb->dte_state->dts_cred.dcr_cred;
6047 proc_t *proc;
6048 #pragma unused(proc) /* __APPLE__ */
6049
6050 ASSERT(s_cr != NULL);
6051
6052 /*
6053 * XXX this is hackish, but so is setting a variable
6054 * XXX in a McCarthy OR...
6055 */
6056 if ((cr = dtrace_CRED()) == NULL ||
6057 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
6058 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
6059 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
6060 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
6061 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
6062 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
6063 #if !defined(__APPLE__)
6064 (proc = ttoproc(curthread)) == NULL ||
6065 (proc->p_flag & SNOCD))
6066 #else
6067 1) /* APPLE NOTE: Darwin omits "No Core Dump" flag */
6068 #endif /* __APPLE__ */
6069 continue;
6070 }
6071
6072 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
6073 cred_t *cr;
6074 cred_t *s_cr =
6075 ecb->dte_state->dts_cred.dcr_cred;
6076 #pragma unused(cr, s_cr) /* __APPLE__ */
6077
6078 ASSERT(s_cr != NULL);
6079
6080 #if !defined(__APPLE__)
6081 if ((cr = CRED()) == NULL ||
6082 s_cr->cr_zone->zone_id !=
6083 cr->cr_zone->zone_id)
6084 continue;
6085 #else
6086 /* APPLE NOTE: Darwin doesn't do zones. */
6087 #endif /* __APPLE__ */
6088 }
6089 }
6090
6091 if (now - state->dts_alive > dtrace_deadman_timeout) {
6092 /*
6093 * We seem to be dead. Unless we (a) have kernel
6094 * destructive permissions, (b) have explicitly enabled
6095 * destructive actions, and (c) destructive actions have
6096 * not been disabled, we're going to transition into
6097 * the KILLED state, from which no further processing
6098 * on this state will be performed.
6099 */
6100 if (!dtrace_priv_kernel_destructive(state) ||
6101 !state->dts_cred.dcr_destructive ||
6102 dtrace_destructive_disallow) {
6103 void *activity = &state->dts_activity;
6104 dtrace_activity_t current;
6105
6106 do {
6107 current = state->dts_activity;
6108 } while (dtrace_cas32(activity, current,
6109 DTRACE_ACTIVITY_KILLED) != current);
6110
6111 continue;
6112 }
6113 }
6114
6115 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
6116 ecb->dte_alignment, state, &mstate)) < 0)
6117 continue;
6118
6119 tomax = buf->dtb_tomax;
6120 ASSERT(tomax != NULL);
6121
6122 if (ecb->dte_size != 0)
6123 DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
6124
6125 mstate.dtms_epid = ecb->dte_epid;
6126 mstate.dtms_present |= DTRACE_MSTATE_EPID;
6127
6128 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
6129 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
6130 else
6131 mstate.dtms_access = 0;
6132
6133 if (pred != NULL) {
6134 dtrace_difo_t *dp = pred->dtp_difo;
6135 int rval;
6136
6137 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
6138
6139 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
6140 dtrace_cacheid_t cid = probe->dtpr_predcache;
6141
6142 if (cid != DTRACE_CACHEIDNONE && !onintr) {
6143 /*
6144 * Update the predicate cache...
6145 */
6146 ASSERT(cid == pred->dtp_cacheid);
6147
6148 dtrace_set_thread_predcache(current_thread(), cid);
6149 }
6150
6151 continue;
6152 }
6153 }
6154
6155 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
6156 act != NULL; act = act->dta_next) {
6157 size_t valoffs;
6158 dtrace_difo_t *dp;
6159 dtrace_recdesc_t *rec = &act->dta_rec;
6160
6161 size = rec->dtrd_size;
6162 valoffs = offs + rec->dtrd_offset;
6163
6164 if (DTRACEACT_ISAGG(act->dta_kind)) {
6165 uint64_t v = 0xbad;
6166 dtrace_aggregation_t *agg;
6167
6168 agg = (dtrace_aggregation_t *)act;
6169
6170 if ((dp = act->dta_difo) != NULL)
6171 v = dtrace_dif_emulate(dp,
6172 &mstate, vstate, state);
6173
6174 if (*flags & CPU_DTRACE_ERROR)
6175 continue;
6176
6177 /*
6178 * Note that we always pass the expression
6179 * value from the previous iteration of the
6180 * action loop. This value will only be used
6181 * if there is an expression argument to the
6182 * aggregating action, denoted by the
6183 * dtag_hasarg field.
6184 */
6185 dtrace_aggregate(agg, buf,
6186 offs, aggbuf, v, val);
6187 continue;
6188 }
6189
6190 switch (act->dta_kind) {
6191 case DTRACEACT_STOP:
6192 if (dtrace_priv_proc_destructive(state))
6193 dtrace_action_stop();
6194 continue;
6195
6196 case DTRACEACT_BREAKPOINT:
6197 if (dtrace_priv_kernel_destructive(state))
6198 dtrace_action_breakpoint(ecb);
6199 continue;
6200
6201 case DTRACEACT_PANIC:
6202 if (dtrace_priv_kernel_destructive(state))
6203 dtrace_action_panic(ecb);
6204 continue;
6205
6206 case DTRACEACT_STACK:
6207 if (!dtrace_priv_kernel(state))
6208 continue;
6209
6210 dtrace_getpcstack((pc_t *)(tomax + valoffs),
6211 size / sizeof (pc_t), probe->dtpr_aframes,
6212 DTRACE_ANCHORED(probe) ? NULL :
6213 (uint32_t *)(uintptr_t)arg0);
6214 continue;
6215
6216 case DTRACEACT_JSTACK:
6217 case DTRACEACT_USTACK:
6218 if (!dtrace_priv_proc(state))
6219 continue;
6220
6221 /*
6222 * See comment in DIF_VAR_PID.
6223 */
6224 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6225 CPU_ON_INTR(CPU)) {
6226 int depth = DTRACE_USTACK_NFRAMES(
6227 rec->dtrd_arg) + 1;
6228
6229 dtrace_bzero((void *)(tomax + valoffs),
6230 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6231 + depth * sizeof (uint64_t));
6232
6233 continue;
6234 }
6235
6236 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6237 curproc->p_dtrace_helpers != NULL) {
6238 /*
6239 * This is the slow path -- we have
6240 * allocated string space, and we're
6241 * getting the stack of a process that
6242 * has helpers. Call into a separate
6243 * routine to perform this processing.
6244 */
6245 dtrace_action_ustack(&mstate, state,
6246 (uint64_t *)(tomax + valoffs),
6247 rec->dtrd_arg);
6248 continue;
6249 }
6250
6251 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6252 dtrace_getupcstack((uint64_t *)
6253 (tomax + valoffs),
6254 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6255 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6256 continue;
6257
6258 default:
6259 break;
6260 }
6261
6262 dp = act->dta_difo;
6263 ASSERT(dp != NULL);
6264
6265 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6266
6267 if (*flags & CPU_DTRACE_ERROR)
6268 continue;
6269
6270 switch (act->dta_kind) {
6271 case DTRACEACT_SPECULATE:
6272 ASSERT(buf == &state->dts_buffer[cpuid]);
6273 buf = dtrace_speculation_buffer(state,
6274 cpuid, val);
6275
6276 if (buf == NULL) {
6277 *flags |= CPU_DTRACE_DROP;
6278 continue;
6279 }
6280
6281 offs = dtrace_buffer_reserve(buf,
6282 ecb->dte_needed, ecb->dte_alignment,
6283 state, NULL);
6284
6285 if (offs < 0) {
6286 *flags |= CPU_DTRACE_DROP;
6287 continue;
6288 }
6289
6290 tomax = buf->dtb_tomax;
6291 ASSERT(tomax != NULL);
6292
6293 if (ecb->dte_size != 0)
6294 DTRACE_STORE(uint32_t, tomax, offs,
6295 ecb->dte_epid);
6296 continue;
6297
6298 case DTRACEACT_CHILL:
6299 if (dtrace_priv_kernel_destructive(state))
6300 dtrace_action_chill(&mstate, val);
6301 continue;
6302
6303 case DTRACEACT_RAISE:
6304 if (dtrace_priv_proc_destructive(state))
6305 dtrace_action_raise(val);
6306 continue;
6307
6308 case DTRACEACT_PIDRESUME: /* __APPLE__ */
6309 if (dtrace_priv_proc_destructive(state))
6310 dtrace_action_pidresume(val);
6311 continue;
6312
6313 case DTRACEACT_COMMIT:
6314 ASSERT(!committed);
6315
6316 /*
6317 * We need to commit our buffer state.
6318 */
6319 if (ecb->dte_size)
6320 buf->dtb_offset = offs + ecb->dte_size;
6321 buf = &state->dts_buffer[cpuid];
6322 dtrace_speculation_commit(state, cpuid, val);
6323 committed = 1;
6324 continue;
6325
6326 case DTRACEACT_DISCARD:
6327 dtrace_speculation_discard(state, cpuid, val);
6328 continue;
6329
6330 case DTRACEACT_DIFEXPR:
6331 case DTRACEACT_LIBACT:
6332 case DTRACEACT_PRINTF:
6333 case DTRACEACT_PRINTA:
6334 case DTRACEACT_SYSTEM:
6335 case DTRACEACT_FREOPEN:
6336 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
6337 case DTRACEACT_TRACEMEM:
6338 break;
6339
6340 case DTRACEACT_TRACEMEM_DYNSIZE:
6341 tracememsize = val;
6342 break;
6343
6344 case DTRACEACT_SYM:
6345 case DTRACEACT_MOD:
6346 if (!dtrace_priv_kernel(state))
6347 continue;
6348 break;
6349
6350 case DTRACEACT_USYM:
6351 case DTRACEACT_UMOD:
6352 case DTRACEACT_UADDR: {
6353 if (!dtrace_priv_proc(state))
6354 continue;
6355
6356 DTRACE_STORE(uint64_t, tomax,
6357 valoffs, (uint64_t)dtrace_proc_selfpid());
6358 DTRACE_STORE(uint64_t, tomax,
6359 valoffs + sizeof (uint64_t), val);
6360
6361 continue;
6362 }
6363
6364 case DTRACEACT_EXIT: {
6365 /*
6366 * For the exit action, we are going to attempt
6367 * to atomically set our activity to be
6368 * draining. If this fails (either because
6369 * another CPU has beat us to the exit action,
6370 * or because our current activity is something
6371 * other than ACTIVE or WARMUP), we will
6372 * continue. This assures that the exit action
6373 * can be successfully recorded at most once
6374 * when we're in the ACTIVE state. If we're
6375 * encountering the exit() action while in
6376 * COOLDOWN, however, we want to honor the new
6377 * status code. (We know that we're the only
6378 * thread in COOLDOWN, so there is no race.)
6379 */
6380 void *activity = &state->dts_activity;
6381 dtrace_activity_t current = state->dts_activity;
6382
6383 if (current == DTRACE_ACTIVITY_COOLDOWN)
6384 break;
6385
6386 if (current != DTRACE_ACTIVITY_WARMUP)
6387 current = DTRACE_ACTIVITY_ACTIVE;
6388
6389 if (dtrace_cas32(activity, current,
6390 DTRACE_ACTIVITY_DRAINING) != current) {
6391 *flags |= CPU_DTRACE_DROP;
6392 continue;
6393 }
6394
6395 break;
6396 }
6397
6398 default:
6399 ASSERT(0);
6400 }
6401
6402 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
6403 uintptr_t end = valoffs + size;
6404
6405 if (tracememsize != 0 &&
6406 valoffs + tracememsize < end)
6407 {
6408 end = valoffs + tracememsize;
6409 tracememsize = 0;
6410 }
6411
6412 if (!dtrace_vcanload((void *)(uintptr_t)val,
6413 &dp->dtdo_rtype, &mstate, vstate))
6414 continue;
6415
6416 /*
6417 * If this is a string, we're going to only
6418 * load until we find the zero byte -- after
6419 * which we'll store zero bytes.
6420 */
6421 if (dp->dtdo_rtype.dtdt_kind ==
6422 DIF_TYPE_STRING) {
6423 char c = '\0' + 1;
6424 int intuple = act->dta_intuple;
6425 size_t s;
6426
6427 for (s = 0; s < size; s++) {
6428 if (c != '\0')
6429 c = dtrace_load8(val++);
6430
6431 DTRACE_STORE(uint8_t, tomax,
6432 valoffs++, c);
6433
6434 if (c == '\0' && intuple)
6435 break;
6436 }
6437
6438 continue;
6439 }
6440
6441 while (valoffs < end) {
6442 DTRACE_STORE(uint8_t, tomax, valoffs++,
6443 dtrace_load8(val++));
6444 }
6445
6446 continue;
6447 }
6448
6449 switch (size) {
6450 case 0:
6451 break;
6452
6453 case sizeof (uint8_t):
6454 DTRACE_STORE(uint8_t, tomax, valoffs, val);
6455 break;
6456 case sizeof (uint16_t):
6457 DTRACE_STORE(uint16_t, tomax, valoffs, val);
6458 break;
6459 case sizeof (uint32_t):
6460 DTRACE_STORE(uint32_t, tomax, valoffs, val);
6461 break;
6462 case sizeof (uint64_t):
6463 DTRACE_STORE(uint64_t, tomax, valoffs, val);
6464 break;
6465 default:
6466 /*
6467 * Any other size should have been returned by
6468 * reference, not by value.
6469 */
6470 ASSERT(0);
6471 break;
6472 }
6473 }
6474
6475 if (*flags & CPU_DTRACE_DROP)
6476 continue;
6477
6478 if (*flags & CPU_DTRACE_FAULT) {
6479 int ndx;
6480 dtrace_action_t *err;
6481
6482 buf->dtb_errors++;
6483
6484 if (probe->dtpr_id == dtrace_probeid_error) {
6485 /*
6486 * There's nothing we can do -- we had an
6487 * error on the error probe. We bump an
6488 * error counter to at least indicate that
6489 * this condition happened.
6490 */
6491 dtrace_error(&state->dts_dblerrors);
6492 continue;
6493 }
6494
6495 if (vtime) {
6496 /*
6497 * Before recursing on dtrace_probe(), we
6498 * need to explicitly clear out our start
6499 * time to prevent it from being accumulated
6500 * into t_dtrace_vtime.
6501 */
6502
6503 /*
6504 * Darwin sets the sign bit on t_dtrace_tracing
6505 * to suspend accumulation to it.
6506 */
6507 dtrace_set_thread_tracing(current_thread(),
6508 (1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
6509
6510 }
6511
6512 /*
6513 * Iterate over the actions to figure out which action
6514 * we were processing when we experienced the error.
6515 * Note that act points _past_ the faulting action; if
6516 * act is ecb->dte_action, the fault was in the
6517 * predicate; if it's ecb->dte_action->dta_next it's
6518 * in action #1, and so on.
6519 */
6520 for (err = ecb->dte_action, ndx = 0;
6521 err != act; err = err->dta_next, ndx++)
6522 continue;
6523
6524 dtrace_probe_error(state, ecb->dte_epid, ndx,
6525 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6526 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6527 cpu_core[cpuid].cpuc_dtrace_illval);
6528
6529 continue;
6530 }
6531
6532 if (!committed)
6533 buf->dtb_offset = offs + ecb->dte_size;
6534 }
6535
6536 /* FIXME: On Darwin, the time spent leaving DTrace from this point to the rti is attributed
6537 to the current thread. Instead, it should accrue to DTrace. */
6538 if (vtime) {
6539 thread_t thread = current_thread();
6540 int64_t t = dtrace_get_thread_tracing(thread);
6541
6542 if (t >= 0) {
6543 /* Usual case, accumulate time spent here into t_dtrace_tracing */
6544 dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
6545 } else {
6546 /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
6547 dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
6548 }
6549 }
6550
6551 dtrace_interrupt_enable(cookie);
6552 }
6553
6554 /*
6555 * APPLE NOTE: Don't allow a thread to re-enter dtrace_probe().
6556 * This could occur if a probe is encountered on some function in the
6557 * transitive closure of the call to dtrace_probe().
6558 * Solaris has some strong guarantees that this won't happen.
6559 * The Darwin implementation is not so mature as to make those guarantees.
6560 * Hence, the introduction of __dtrace_probe() on xnu.
6561 */
6562
6563 void
6564 dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6565 uint64_t arg2, uint64_t arg3, uint64_t arg4)
6566 {
6567 thread_t thread = current_thread();
6568 disable_preemption();
6569 if (id == dtrace_probeid_error) {
6570 __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
6571 dtrace_getipl(); /* Defeat tail-call optimization of __dtrace_probe() */
6572 } else if (!dtrace_get_thread_reentering(thread)) {
6573 dtrace_set_thread_reentering(thread, TRUE);
6574 __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
6575 dtrace_set_thread_reentering(thread, FALSE);
6576 }
6577 #if DEBUG
6578 else __dtrace_probe(dtrace_probeid_error, 0, id, 1, -1, DTRACEFLT_UNKNOWN);
6579 #endif
6580 enable_preemption();
6581 }
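
/*
 * As a usage sketch (illustrative only -- the probe id and arguments below
 * are hypothetical): once a provider has created a probe and the framework
 * has enabled it, the provider fires it from its instrumentation site by
 * calling dtrace_probe() with the id returned by dtrace_probe_create() and
 * up to five argument values:
 *
 *	dtrace_probe(my_probe_id, (uint64_t)(uintptr_t)some_ptr,
 *	    (uint64_t)some_count, 0, 0, 0);
 */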
6582
6583 /*
6584 * DTrace Probe Hashing Functions
6585 *
6586 * The functions in this section (and indeed, the functions in the remaining
6587 * sections) are not _called_ from probe context. (Any exceptions to this are
6588 * marked with a "Note:".) Rather, they are called from elsewhere in the
6589 * DTrace framework to look up probes in, add probes to, and remove probes from
6590 * the DTrace probe hashes. (Each probe is hashed by each element of the
6591 * probe tuple -- allowing for fast lookups, regardless of what was
6592 * specified.)
6593 */
6594 static uint_t
6595 dtrace_hash_str(const char *p)
6596 {
6597 unsigned int g;
6598 uint_t hval = 0;
6599
6600 while (*p) {
6601 hval = (hval << 4) + *p++;
6602 if ((g = (hval & 0xf0000000)) != 0)
6603 hval ^= g >> 24;
6604 hval &= ~g;
6605 }
6606 return (hval);
6607 }
6608
6609 static dtrace_hash_t *
6610 dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
6611 {
6612 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
6613
6614 hash->dth_stroffs = stroffs;
6615 hash->dth_nextoffs = nextoffs;
6616 hash->dth_prevoffs = prevoffs;
6617
6618 hash->dth_size = 1;
6619 hash->dth_mask = hash->dth_size - 1;
6620
6621 hash->dth_tab = kmem_zalloc(hash->dth_size *
6622 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
6623
6624 return (hash);
6625 }
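
/*
 * As a usage sketch: each probe hash is keyed on one string member of
 * dtrace_probe_t, with per-probe chain links supplied via their structure
 * offsets. The by-module hash, for example, is created along these lines
 * (the actual call sites appear later in this file):
 *
 *	dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
 *	    offsetof(dtrace_probe_t, dtpr_nextmod),
 *	    offsetof(dtrace_probe_t, dtpr_prevmod));
 */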
6626
6627 /*
6628 * APPLE NOTE: dtrace_hash_destroy is not used.
6629 * It would only be called by dtrace_detach, which is not
6630 * currently implemented. Revisit someday.
6631 */
6632 #if !defined(__APPLE__)
6633 static void
6634 dtrace_hash_destroy(dtrace_hash_t *hash)
6635 {
6636 #if DEBUG
6637 int i;
6638
6639 for (i = 0; i < hash->dth_size; i++)
6640 ASSERT(hash->dth_tab[i] == NULL);
6641 #endif
6642
6643 kmem_free(hash->dth_tab,
6644 hash->dth_size * sizeof (dtrace_hashbucket_t *));
6645 kmem_free(hash, sizeof (dtrace_hash_t));
6646 }
6647 #endif /* __APPLE__ */
6648
6649 static void
6650 dtrace_hash_resize(dtrace_hash_t *hash)
6651 {
6652 int size = hash->dth_size, i, ndx;
6653 int new_size = hash->dth_size << 1;
6654 int new_mask = new_size - 1;
6655 dtrace_hashbucket_t **new_tab, *bucket, *next;
6656
6657 ASSERT((new_size & new_mask) == 0);
6658
6659 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
6660
6661 for (i = 0; i < size; i++) {
6662 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
6663 dtrace_probe_t *probe = bucket->dthb_chain;
6664
6665 ASSERT(probe != NULL);
6666 ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
6667
6668 next = bucket->dthb_next;
6669 bucket->dthb_next = new_tab[ndx];
6670 new_tab[ndx] = bucket;
6671 }
6672 }
6673
6674 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
6675 hash->dth_tab = new_tab;
6676 hash->dth_size = new_size;
6677 hash->dth_mask = new_mask;
6678 }
6679
6680 static void
6681 dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
6682 {
6683 int hashval = DTRACE_HASHSTR(hash, new);
6684 int ndx = hashval & hash->dth_mask;
6685 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6686 dtrace_probe_t **nextp, **prevp;
6687
6688 for (; bucket != NULL; bucket = bucket->dthb_next) {
6689 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
6690 goto add;
6691 }
6692
6693 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
6694 dtrace_hash_resize(hash);
6695 dtrace_hash_add(hash, new);
6696 return;
6697 }
6698
6699 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
6700 bucket->dthb_next = hash->dth_tab[ndx];
6701 hash->dth_tab[ndx] = bucket;
6702 hash->dth_nbuckets++;
6703
6704 add:
6705 nextp = DTRACE_HASHNEXT(hash, new);
6706 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
6707 *nextp = bucket->dthb_chain;
6708
6709 if (bucket->dthb_chain != NULL) {
6710 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
6711 ASSERT(*prevp == NULL);
6712 *prevp = new;
6713 }
6714
6715 bucket->dthb_chain = new;
6716 bucket->dthb_len++;
6717 }
6718
6719 static dtrace_probe_t *
6720 dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
6721 {
6722 int hashval = DTRACE_HASHSTR(hash, template);
6723 int ndx = hashval & hash->dth_mask;
6724 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6725
6726 for (; bucket != NULL; bucket = bucket->dthb_next) {
6727 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6728 return (bucket->dthb_chain);
6729 }
6730
6731 return (NULL);
6732 }
6733
6734 static int
6735 dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
6736 {
6737 int hashval = DTRACE_HASHSTR(hash, template);
6738 int ndx = hashval & hash->dth_mask;
6739 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6740
6741 for (; bucket != NULL; bucket = bucket->dthb_next) {
6742 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6743 return (bucket->dthb_len);
6744 }
6745
6746 return (0);
6747 }
6748
6749 static void
6750 dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
6751 {
6752 int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
6753 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6754
6755 dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
6756 dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
6757
6758 /*
6759 * Find the bucket that we're removing this probe from.
6760 */
6761 for (; bucket != NULL; bucket = bucket->dthb_next) {
6762 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
6763 break;
6764 }
6765
6766 ASSERT(bucket != NULL);
6767
6768 if (*prevp == NULL) {
6769 if (*nextp == NULL) {
6770 /*
6771 * The removed probe was the only probe on this
6772 * bucket; we need to remove the bucket.
6773 */
6774 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
6775
6776 ASSERT(bucket->dthb_chain == probe);
6777 ASSERT(b != NULL);
6778
6779 if (b == bucket) {
6780 hash->dth_tab[ndx] = bucket->dthb_next;
6781 } else {
6782 while (b->dthb_next != bucket)
6783 b = b->dthb_next;
6784 b->dthb_next = bucket->dthb_next;
6785 }
6786
6787 ASSERT(hash->dth_nbuckets > 0);
6788 hash->dth_nbuckets--;
6789 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
6790 return;
6791 }
6792
6793 bucket->dthb_chain = *nextp;
6794 } else {
6795 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
6796 }
6797
6798 if (*nextp != NULL)
6799 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
6800 }
6801
6802 /*
6803 * DTrace Utility Functions
6804 *
6805 * These are random utility functions that are _not_ called from probe context.
6806 */
6807 static int
6808 dtrace_badattr(const dtrace_attribute_t *a)
6809 {
6810 return (a->dtat_name > DTRACE_STABILITY_MAX ||
6811 a->dtat_data > DTRACE_STABILITY_MAX ||
6812 a->dtat_class > DTRACE_CLASS_MAX);
6813 }
6814
6815 /*
6816 * Return a duplicate of a string. If the specified string is NULL,
6817 * this function returns a zero-length string.
6818 * APPLE NOTE: Darwin employs size bounded string operation.
6819 */
6820 static char *
6821 dtrace_strdup(const char *str)
6822 {
6823 size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
6824 char *new = kmem_zalloc(bufsize, KM_SLEEP);
6825
6826 if (str != NULL)
6827 (void) strlcpy(new, str, bufsize);
6828
6829 return (new);
6830 }
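
/*
 * For example, dtrace_strdup(NULL) returns a freshly allocated empty string
 * ("") rather than NULL, so callers need not special-case a NULL result.
 */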
6831
6832 #define DTRACE_ISALPHA(c) \
6833 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
6834
6835 static int
6836 dtrace_badname(const char *s)
6837 {
6838 char c;
6839
6840 if (s == NULL || (c = *s++) == '\0')
6841 return (0);
6842
6843 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
6844 return (1);
6845
6846 while ((c = *s++) != '\0') {
6847 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
6848 c != '-' && c != '_' && c != '.' && c != '`')
6849 return (1);
6850 }
6851
6852 return (0);
6853 }
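
/*
 * For example, names such as "syscall", "mach_trap" or "fbt" are accepted,
 * whereas "4grep" (leading digit) or "my provider" (embedded space) would be
 * rejected as bad names.
 */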
6854
6855 static void
6856 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
6857 {
6858 uint32_t priv;
6859
6860 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
6861 /*
6862 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
6863 */
6864 priv = DTRACE_PRIV_ALL;
6865 } else {
6866 *uidp = crgetuid(cr);
6867 *zoneidp = crgetzoneid(cr);
6868
6869 priv = 0;
6870 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
6871 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
6872 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
6873 priv |= DTRACE_PRIV_USER;
6874 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
6875 priv |= DTRACE_PRIV_PROC;
6876 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
6877 priv |= DTRACE_PRIV_OWNER;
6878 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
6879 priv |= DTRACE_PRIV_ZONEOWNER;
6880 }
6881
6882 *privp = priv;
6883 }
6884
6885 #ifdef DTRACE_ERRDEBUG
6886 static void
6887 dtrace_errdebug(const char *str)
6888 {
6889 int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
6890 int occupied = 0;
6891
6892 lck_mtx_lock(&dtrace_errlock);
6893 dtrace_errlast = str;
6894 dtrace_errthread = (kthread_t *)current_thread();
6895
6896 while (occupied++ < DTRACE_ERRHASHSZ) {
6897 if (dtrace_errhash[hval].dter_msg == str) {
6898 dtrace_errhash[hval].dter_count++;
6899 goto out;
6900 }
6901
6902 if (dtrace_errhash[hval].dter_msg != NULL) {
6903 hval = (hval + 1) % DTRACE_ERRHASHSZ;
6904 continue;
6905 }
6906
6907 dtrace_errhash[hval].dter_msg = str;
6908 dtrace_errhash[hval].dter_count = 1;
6909 goto out;
6910 }
6911
6912 panic("dtrace: undersized error hash");
6913 out:
6914 lck_mtx_unlock(&dtrace_errlock);
6915 }
6916 #endif
6917
6918 /*
6919 * DTrace Matching Functions
6920 *
6921 * These functions are used to match groups of probes, given some elements of
6922 * a probe tuple, or some globbed expressions for elements of a probe tuple.
6923 */
6924 static int
6925 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
6926 zoneid_t zoneid)
6927 {
6928 if (priv != DTRACE_PRIV_ALL) {
6929 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
6930 uint32_t match = priv & ppriv;
6931
6932 /*
6933 * No PRIV_DTRACE_* privileges...
6934 */
6935 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
6936 DTRACE_PRIV_KERNEL)) == 0)
6937 return (0);
6938
6939 /*
6940 * No matching bits, but there were bits to match...
6941 */
6942 if (match == 0 && ppriv != 0)
6943 return (0);
6944
6945 /*
6946 * Need to have permissions to the process, but don't...
6947 */
6948 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
6949 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
6950 return (0);
6951 }
6952
6953 /*
6954 * Need to be in the same zone unless we possess the
6955 * privilege to examine all zones.
6956 */
6957 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
6958 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
6959 return (0);
6960 }
6961 }
6962
6963 return (1);
6964 }
6965
6966 /*
6967 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
6968 * consists of input pattern strings and an ops-vector to evaluate them.
6969 * This function returns >0 for match, 0 for no match, and <0 for error.
6970 */
6971 static int
6972 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
6973 uint32_t priv, uid_t uid, zoneid_t zoneid)
6974 {
6975 dtrace_provider_t *pvp = prp->dtpr_provider;
6976 int rv;
6977
6978 if (pvp->dtpv_defunct)
6979 return (0);
6980
6981 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
6982 return (rv);
6983
6984 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
6985 return (rv);
6986
6987 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
6988 return (rv);
6989
6990 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
6991 return (rv);
6992
6993 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
6994 return (0);
6995
6996 return (rv);
6997 }
6998
6999 /*
7000 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
7001 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
7002 * libc's version, the kernel version only applies to 8-bit ASCII strings.
7003 * In addition, all of the recursion cases except for '*' matching have been
7004 * unwound. For '*', we still implement recursive evaluation, but a depth
7005 * counter is maintained and matching is aborted if we recurse too deep.
7006 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
7007 */
7008 static int
7009 dtrace_match_glob(const char *s, const char *p, int depth)
7010 {
7011 const char *olds;
7012 char s1, c;
7013 int gs;
7014
7015 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
7016 return (-1);
7017
7018 if (s == NULL)
7019 s = ""; /* treat NULL as empty string */
7020
7021 top:
7022 olds = s;
7023 s1 = *s++;
7024
7025 if (p == NULL)
7026 return (0);
7027
7028 if ((c = *p++) == '\0')
7029 return (s1 == '\0');
7030
7031 switch (c) {
7032 case '[': {
7033 int ok = 0, notflag = 0;
7034 char lc = '\0';
7035
7036 if (s1 == '\0')
7037 return (0);
7038
7039 if (*p == '!') {
7040 notflag = 1;
7041 p++;
7042 }
7043
7044 if ((c = *p++) == '\0')
7045 return (0);
7046
7047 do {
7048 if (c == '-' && lc != '\0' && *p != ']') {
7049 if ((c = *p++) == '\0')
7050 return (0);
7051 if (c == '\\' && (c = *p++) == '\0')
7052 return (0);
7053
7054 if (notflag) {
7055 if (s1 < lc || s1 > c)
7056 ok++;
7057 else
7058 return (0);
7059 } else if (lc <= s1 && s1 <= c)
7060 ok++;
7061
7062 } else if (c == '\\' && (c = *p++) == '\0')
7063 return (0);
7064
7065 lc = c; /* save left-hand 'c' for next iteration */
7066
7067 if (notflag) {
7068 if (s1 != c)
7069 ok++;
7070 else
7071 return (0);
7072 } else if (s1 == c)
7073 ok++;
7074
7075 if ((c = *p++) == '\0')
7076 return (0);
7077
7078 } while (c != ']');
7079
7080 if (ok)
7081 goto top;
7082
7083 return (0);
7084 }
7085
7086 case '\\':
7087 if ((c = *p++) == '\0')
7088 return (0);
7089 /*FALLTHRU*/
7090
7091 default:
7092 if (c != s1)
7093 return (0);
7094 /*FALLTHRU*/
7095
7096 case '?':
7097 if (s1 != '\0')
7098 goto top;
7099 return (0);
7100
7101 case '*':
7102 while (*p == '*')
7103 p++; /* consecutive *'s are identical to a single one */
7104
7105 if (*p == '\0')
7106 return (1);
7107
7108 for (s = olds; *s != '\0'; s++) {
7109 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
7110 return (gs);
7111 }
7112
7113 return (0);
7114 }
7115 }
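
/*
 * A few illustrative evaluations (a return value greater than zero denotes
 * a match):
 *
 *	dtrace_match_glob("mach_msg_trap", "mach_*", 0)    matches
 *	dtrace_match_glob("read", "r?a*", 0)               matches
 *	dtrace_match_glob("write", "[a-f]*", 0)            does not match
 */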
7116
7117 /*ARGSUSED*/
7118 static int
7119 dtrace_match_string(const char *s, const char *p, int depth)
7120 {
7121 #pragma unused(depth) /* __APPLE__ */
7122
7123 /* APPLE NOTE: Darwin employs size bounded string operation. */
7124 return (s != NULL && strncmp(s, p, strlen(s) + 1) == 0);
7125 }
7126
7127 /*ARGSUSED*/
7128 static int
7129 dtrace_match_nul(const char *s, const char *p, int depth)
7130 {
7131 #pragma unused(s, p, depth) /* __APPLE__ */
7132 return (1); /* always match the empty pattern */
7133 }
7134
7135 /*ARGSUSED*/
7136 static int
7137 dtrace_match_nonzero(const char *s, const char *p, int depth)
7138 {
7139 #pragma unused(p, depth) /* __APPLE__ */
7140 return (s != NULL && s[0] != '\0');
7141 }
7142
7143 static int
7144 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
7145 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
7146 {
7147 dtrace_probe_t template, *probe;
7148 dtrace_hash_t *hash = NULL;
7149 int len, rc, best = INT_MAX, nmatched = 0;
7150 dtrace_id_t i;
7151
7152 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7153
7154 /*
7155 * If the probe ID is specified in the key, just lookup by ID and
7156 * invoke the match callback once if a matching probe is found.
7157 */
7158 if (pkp->dtpk_id != DTRACE_IDNONE) {
7159 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
7160 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
7161 if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL)
7162 return (DTRACE_MATCH_FAIL);
7163 nmatched++;
7164 }
7165 return (nmatched);
7166 }
7167
7168 template.dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod;
7169 template.dtpr_func = (char *)(uintptr_t)pkp->dtpk_func;
7170 template.dtpr_name = (char *)(uintptr_t)pkp->dtpk_name;
7171
7172 /*
7173 * We want to find the most distinct of the module name, function
7174 * name, and name. So for each one that is not a glob pattern or
7175 * empty string, we perform a lookup in the corresponding hash and
7176 * use the hash table with the fewest collisions to do our search.
7177 */
7178 if (pkp->dtpk_mmatch == &dtrace_match_string &&
7179 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
7180 best = len;
7181 hash = dtrace_bymod;
7182 }
7183
7184 if (pkp->dtpk_fmatch == &dtrace_match_string &&
7185 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7186 best = len;
7187 hash = dtrace_byfunc;
7188 }
7189
7190 if (pkp->dtpk_nmatch == &dtrace_match_string &&
7191 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7192 best = len;
7193 hash = dtrace_byname;
7194 }
7195
7196 /*
7197 * If we did not select a hash table, iterate over every probe and
7198 * invoke our callback for each one that matches our input probe key.
7199 */
7200 if (hash == NULL) {
7201 for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
7202 if ((probe = dtrace_probes[i]) == NULL ||
7203 dtrace_match_probe(probe, pkp, priv, uid,
7204 zoneid) <= 0)
7205 continue;
7206
7207 nmatched++;
7208
7209 if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
7210 if (rc == DTRACE_MATCH_FAIL)
7211 return (DTRACE_MATCH_FAIL);
7212 break;
7213 }
7214 }
7215
7216 return (nmatched);
7217 }
7218
7219 /*
7220 * If we selected a hash table, iterate over each probe of the same key
7221 * name and invoke the callback for every probe that matches the other
7222 * attributes of our input probe key.
7223 */
7224 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7225 probe = *(DTRACE_HASHNEXT(hash, probe))) {
7226
7227 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7228 continue;
7229
7230 nmatched++;
7231
7232 if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
7233 if (rc == DTRACE_MATCH_FAIL)
7234 return (DTRACE_MATCH_FAIL);
7235 break;
7236 }
7237 }
7238
7239 return (nmatched);
7240 }
7241
7242 /*
7243 * Return the match function that dtrace_match_probe() should use to compare
7244 * the specified pattern with a string. For NULL or empty patterns, we select
7245 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
7246 * For non-empty non-glob strings, we use dtrace_match_string().
7247 */
7248 static dtrace_probekey_f *
7249 dtrace_probekey_func(const char *p)
7250 {
7251 char c;
7252
7253 if (p == NULL || *p == '\0')
7254 return (&dtrace_match_nul);
7255
7256 while ((c = *p++) != '\0') {
7257 if (c == '[' || c == '?' || c == '*' || c == '\\')
7258 return (&dtrace_match_glob);
7259 }
7260
7261 return (&dtrace_match_string);
7262 }
7263
7264 /*
7265 * Build a probe comparison key for use with dtrace_match_probe() from the
7266 * given probe description. By convention, a null key only matches anchored
7267 * probes: if each field is the empty string, reset dtpk_fmatch to
7268 * dtrace_match_nonzero().
7269 */
7270 static void
7271 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7272 {
7273 pkp->dtpk_prov = pdp->dtpd_provider;
7274 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7275
7276 pkp->dtpk_mod = pdp->dtpd_mod;
7277 pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
7278
7279 pkp->dtpk_func = pdp->dtpd_func;
7280 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7281
7282 pkp->dtpk_name = pdp->dtpd_name;
7283 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7284
7285 pkp->dtpk_id = pdp->dtpd_id;
7286
7287 if (pkp->dtpk_id == DTRACE_IDNONE &&
7288 pkp->dtpk_pmatch == &dtrace_match_nul &&
7289 pkp->dtpk_mmatch == &dtrace_match_nul &&
7290 pkp->dtpk_fmatch == &dtrace_match_nul &&
7291 pkp->dtpk_nmatch == &dtrace_match_nul)
7292 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7293 }
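
/*
 * As a worked example of the above: a description with provider "syscall",
 * module "", function "open" and name "entry" (the D form "syscall::open:entry")
 * yields a key that uses dtrace_match_string() for the provider, function and
 * name and dtrace_match_nul() for the empty module field. A completely empty
 * description instead has dtpk_fmatch reset to dtrace_match_nonzero(), so that
 * it matches only anchored probes.
 */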
7294
7295 /*
7296 * DTrace Provider-to-Framework API Functions
7297 *
7298 * These functions implement much of the Provider-to-Framework API, as
7299 * described in <sys/dtrace.h>. The parts of the API not in this section are
7300 * the functions in the API for probe management (found below), and
7301 * dtrace_probe() itself (found above).
7302 */
7303
7304 /*
7305 * Register the calling provider with the DTrace framework. This should
7306 * generally be called by DTrace providers in their attach(9E) entry point.
7307 */
7308 int
7309 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7310 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7311 {
7312 dtrace_provider_t *provider;
7313
7314 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7315 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7316 "arguments", name ? name : "<NULL>");
7317 return (EINVAL);
7318 }
7319
7320 if (name[0] == '\0' || dtrace_badname(name)) {
7321 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7322 "provider name", name);
7323 return (EINVAL);
7324 }
7325
7326 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7327 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7328 pops->dtps_destroy == NULL ||
7329 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7330 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7331 "provider ops", name);
7332 return (EINVAL);
7333 }
7334
7335 if (dtrace_badattr(&pap->dtpa_provider) ||
7336 dtrace_badattr(&pap->dtpa_mod) ||
7337 dtrace_badattr(&pap->dtpa_func) ||
7338 dtrace_badattr(&pap->dtpa_name) ||
7339 dtrace_badattr(&pap->dtpa_args)) {
7340 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7341 "provider attributes", name);
7342 return (EINVAL);
7343 }
7344
7345 if (priv & ~DTRACE_PRIV_ALL) {
7346 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7347 "privilege attributes", name);
7348 return (EINVAL);
7349 }
7350
7351 if ((priv & DTRACE_PRIV_KERNEL) &&
7352 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7353 pops->dtps_usermode == NULL) {
7354 cmn_err(CE_WARN, "failed to register provider '%s': need "
7355 "dtps_usermode() op for given privilege attributes", name);
7356 return (EINVAL);
7357 }
7358
7359 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7360
7361 /* APPLE NOTE: Darwin employs size bounded string operation. */
7362 {
7363 size_t bufsize = strlen(name) + 1;
7364 provider->dtpv_name = kmem_alloc(bufsize, KM_SLEEP);
7365 (void) strlcpy(provider->dtpv_name, name, bufsize);
7366 }
7367
7368 provider->dtpv_attr = *pap;
7369 provider->dtpv_priv.dtpp_flags = priv;
7370 if (cr != NULL) {
7371 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7372 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7373 }
7374 provider->dtpv_pops = *pops;
7375
7376 if (pops->dtps_provide == NULL) {
7377 ASSERT(pops->dtps_provide_module != NULL);
7378 provider->dtpv_pops.dtps_provide =
7379 (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
7380 }
7381
7382 if (pops->dtps_provide_module == NULL) {
7383 ASSERT(pops->dtps_provide != NULL);
7384 provider->dtpv_pops.dtps_provide_module =
7385 (void (*)(void *, struct modctl *))dtrace_nullop;
7386 }
7387
7388 if (pops->dtps_suspend == NULL) {
7389 ASSERT(pops->dtps_resume == NULL);
7390 provider->dtpv_pops.dtps_suspend =
7391 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7392 provider->dtpv_pops.dtps_resume =
7393 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7394 }
7395
7396 provider->dtpv_arg = arg;
7397 *idp = (dtrace_provider_id_t)provider;
7398
7399 if (pops == &dtrace_provider_ops) {
7400 lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
7401 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7402 ASSERT(dtrace_anon.dta_enabling == NULL);
7403
7404 /*
7405 * We make sure that the DTrace provider is at the head of
7406 * the provider chain.
7407 */
7408 provider->dtpv_next = dtrace_provider;
7409 dtrace_provider = provider;
7410 return (0);
7411 }
7412
7413 lck_mtx_lock(&dtrace_provider_lock);
7414 lck_mtx_lock(&dtrace_lock);
7415
7416 /*
7417 * If there is at least one provider registered, we'll add this
7418 * provider after the first provider.
7419 */
7420 if (dtrace_provider != NULL) {
7421 provider->dtpv_next = dtrace_provider->dtpv_next;
7422 dtrace_provider->dtpv_next = provider;
7423 } else {
7424 dtrace_provider = provider;
7425 }
7426
7427 if (dtrace_retained != NULL) {
7428 dtrace_enabling_provide(provider);
7429
7430 /*
7431 * Now we need to call dtrace_enabling_matchall() -- which
7432 * will acquire cpu_lock and dtrace_lock. We therefore need
7433 * to drop all of our locks before calling into it...
7434 */
7435 lck_mtx_unlock(&dtrace_lock);
7436 lck_mtx_unlock(&dtrace_provider_lock);
7437 dtrace_enabling_matchall();
7438
7439 return (0);
7440 }
7441
7442 lck_mtx_unlock(&dtrace_lock);
7443 lck_mtx_unlock(&dtrace_provider_lock);
7444
7445 return (0);
7446 }
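
/*
 * A registration sketch for a hypothetical provider (the names and the
 * attribute/ops tables are illustrative; see <sys/dtrace.h> for the structure
 * definitions):
 *
 *	static dtrace_provider_id_t my_provider_id;
 *
 *	if (dtrace_register("myprov", &my_attr, DTRACE_PRIV_KERNEL, NULL,
 *	    &my_pops, NULL, &my_provider_id) != 0)
 *		return;
 *
 * Per the checks above, "my_pops" must supply dtps_provide (or
 * dtps_provide_module), dtps_enable, dtps_disable and dtps_destroy, and must
 * supply dtps_suspend and dtps_resume either both or not at all.
 */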
7447
7448 /*
7449 * Unregister the specified provider from the DTrace framework. This should
7450 * generally be called by DTrace providers in their detach(9E) entry point.
7451 */
7452 int
7453 dtrace_unregister(dtrace_provider_id_t id)
7454 {
7455 dtrace_provider_t *old = (dtrace_provider_t *)id;
7456 dtrace_provider_t *prev = NULL;
7457 int i, self = 0;
7458 dtrace_probe_t *probe, *first = NULL;
7459
7460 if (old->dtpv_pops.dtps_enable ==
7461 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
7462 /*
7463 * If DTrace itself is the provider, we're called with locks
7464 * already held.
7465 */
7466 ASSERT(old == dtrace_provider);
7467 ASSERT(dtrace_devi != NULL);
7468 lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
7469 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7470 self = 1;
7471
7472 if (dtrace_provider->dtpv_next != NULL) {
7473 /*
7474 * There's another provider here; return failure.
7475 */
7476 return (EBUSY);
7477 }
7478 } else {
7479 lck_mtx_lock(&dtrace_provider_lock);
7480 lck_mtx_lock(&mod_lock);
7481 lck_mtx_lock(&dtrace_lock);
7482 }
7483
7484 /*
7485 * If anyone has /dev/dtrace open, or if there are anonymous enabled
7486 * probes, we refuse to let providers slither away, unless this
7487 * provider has already been explicitly invalidated.
7488 */
7489 if (!old->dtpv_defunct &&
7490 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
7491 dtrace_anon.dta_state->dts_necbs > 0))) {
7492 if (!self) {
7493 lck_mtx_unlock(&dtrace_lock);
7494 lck_mtx_unlock(&mod_lock);
7495 lck_mtx_unlock(&dtrace_provider_lock);
7496 }
7497 return (EBUSY);
7498 }
7499
7500 /*
7501 * Attempt to destroy the probes associated with this provider.
7502 */
7503 if (old->dtpv_ecb_count!=0) {
7504 /*
7505 * We have at least one ECB; we can't remove this provider.
7506 */
7507 if (!self) {
7508 lck_mtx_unlock(&dtrace_lock);
7509 lck_mtx_unlock(&mod_lock);
7510 lck_mtx_unlock(&dtrace_provider_lock);
7511 }
7512 return (EBUSY);
7513 }
7514
7515 /*
7516 * All of the probes for this provider are disabled; we can safely
7517 * remove all of them from their hash chains and from the probe array.
7518 */
7519 for (i = 0; i < dtrace_nprobes && old->dtpv_probe_count!=0; i++) {
7520 if ((probe = dtrace_probes[i]) == NULL)
7521 continue;
7522
7523 if (probe->dtpr_provider != old)
7524 continue;
7525
7526 dtrace_probes[i] = NULL;
7527 old->dtpv_probe_count--;
7528
7529 dtrace_hash_remove(dtrace_bymod, probe);
7530 dtrace_hash_remove(dtrace_byfunc, probe);
7531 dtrace_hash_remove(dtrace_byname, probe);
7532
7533 if (first == NULL) {
7534 first = probe;
7535 probe->dtpr_nextmod = NULL;
7536 } else {
7537 probe->dtpr_nextmod = first;
7538 first = probe;
7539 }
7540 }
7541
7542 /*
7543 * The provider's probes have been removed from the hash chains and
7544 * from the probe array. Now issue a dtrace_sync() to be sure that
7545 * everyone has cleared out from any probe array processing.
7546 */
7547 dtrace_sync();
7548
7549 for (probe = first; probe != NULL; probe = first) {
7550 first = probe->dtpr_nextmod;
7551
7552 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
7553 probe->dtpr_arg);
7554 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7555 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7556 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7557 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
7558 zfree(dtrace_probe_t_zone, probe);
7559 }
7560
7561 if ((prev = dtrace_provider) == old) {
7562 ASSERT(self || dtrace_devi == NULL);
7563 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
7564 dtrace_provider = old->dtpv_next;
7565 } else {
7566 while (prev != NULL && prev->dtpv_next != old)
7567 prev = prev->dtpv_next;
7568
7569 if (prev == NULL) {
7570 panic("attempt to unregister non-existent "
7571 "dtrace provider %p\n", (void *)id);
7572 }
7573
7574 prev->dtpv_next = old->dtpv_next;
7575 }
7576
7577 if (!self) {
7578 lck_mtx_unlock(&dtrace_lock);
7579 lck_mtx_unlock(&mod_lock);
7580 lck_mtx_unlock(&dtrace_provider_lock);
7581 }
7582
7583 kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
7584 kmem_free(old, sizeof (dtrace_provider_t));
7585
7586 return (0);
7587 }
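
/*
 * A typical detach path (with an illustrative provider id) simply attempts
 * the unregistration and backs off if the framework refuses -- for instance
 * when consumers still hold /dev/dtrace open:
 *
 *	if (dtrace_unregister(my_provider_id) != 0)
 *		return (EBUSY);
 */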
7588
7589 /*
7590 * Invalidate the specified provider. All subsequent probe lookups for the
7591 * specified provider will fail, but its probes will not be removed.
7592 */
7593 void
7594 dtrace_invalidate(dtrace_provider_id_t id)
7595 {
7596 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
7597
7598 ASSERT(pvp->dtpv_pops.dtps_enable !=
7599 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7600
7601 lck_mtx_lock(&dtrace_provider_lock);
7602 lck_mtx_lock(&dtrace_lock);
7603
7604 pvp->dtpv_defunct = 1;
7605
7606 lck_mtx_unlock(&dtrace_lock);
7607 lck_mtx_unlock(&dtrace_provider_lock);
7608 }
7609
7610 /*
7611 * Indicate whether or not DTrace has attached.
7612 */
7613 int
7614 dtrace_attached(void)
7615 {
7616 /*
7617 * dtrace_provider will be non-NULL iff the DTrace driver has
7618 * attached. (It's non-NULL because DTrace is always itself a
7619 * provider.)
7620 */
7621 return (dtrace_provider != NULL);
7622 }
7623
7624 /*
7625 * Remove all the unenabled probes for the given provider. This function is
7626 * not unlike dtrace_unregister(), except that it doesn't remove the provider
7627 * -- just as many of its associated probes as it can.
7628 */
7629 int
7630 dtrace_condense(dtrace_provider_id_t id)
7631 {
7632 dtrace_provider_t *prov = (dtrace_provider_t *)id;
7633 int i;
7634 dtrace_probe_t *probe;
7635
7636 /*
7637 * Make sure this isn't the dtrace provider itself.
7638 */
7639 ASSERT(prov->dtpv_pops.dtps_enable !=
7640 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7641
7642 lck_mtx_lock(&dtrace_provider_lock);
7643 lck_mtx_lock(&dtrace_lock);
7644
7645 /*
7646 * Attempt to destroy the probes associated with this provider.
7647 */
7648 for (i = 0; i < dtrace_nprobes; i++) {
7649 if ((probe = dtrace_probes[i]) == NULL)
7650 continue;
7651
7652 if (probe->dtpr_provider != prov)
7653 continue;
7654
7655 if (probe->dtpr_ecb != NULL)
7656 continue;
7657
7658 dtrace_probes[i] = NULL;
7659 prov->dtpv_probe_count--;
7660
7661 dtrace_hash_remove(dtrace_bymod, probe);
7662 dtrace_hash_remove(dtrace_byfunc, probe);
7663 dtrace_hash_remove(dtrace_byname, probe);
7664
7665 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
7666 probe->dtpr_arg);
7667 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7668 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7669 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7670 zfree(dtrace_probe_t_zone, probe);
7671 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
7672 }
7673
7674 lck_mtx_unlock(&dtrace_lock);
7675 lck_mtx_unlock(&dtrace_provider_lock);
7676
7677 return (0);
7678 }
7679
7680 /*
7681 * DTrace Probe Management Functions
7682 *
7683 * The functions in this section perform the DTrace probe management,
7684 * including functions to create probes, look-up probes, and call into the
7685 * providers to request that probes be provided. Some of these functions are
7686 * in the Provider-to-Framework API; these functions can be identified by the
7687 * fact that they are not declared "static".
7688 */
7689
7690 /*
7691 * Create a probe with the specified module name, function name, and name.
7692 */
7693 dtrace_id_t
7694 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
7695 const char *func, const char *name, int aframes, void *arg)
7696 {
7697 dtrace_probe_t *probe, **probes;
7698 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
7699 dtrace_id_t id;
7700
7701 if (provider == dtrace_provider) {
7702 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7703 } else {
7704 lck_mtx_lock(&dtrace_lock);
7705 }
7706
7707 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
7708 VM_BESTFIT | VM_SLEEP);
7709
7710 probe = zalloc(dtrace_probe_t_zone);
7711 bzero(probe, sizeof (dtrace_probe_t));
7712
7713 probe->dtpr_id = id;
7714 probe->dtpr_gen = dtrace_probegen++;
7715 probe->dtpr_mod = dtrace_strdup(mod);
7716 probe->dtpr_func = dtrace_strdup(func);
7717 probe->dtpr_name = dtrace_strdup(name);
7718 probe->dtpr_arg = arg;
7719 probe->dtpr_aframes = aframes;
7720 probe->dtpr_provider = provider;
7721
7722 dtrace_hash_add(dtrace_bymod, probe);
7723 dtrace_hash_add(dtrace_byfunc, probe);
7724 dtrace_hash_add(dtrace_byname, probe);
7725
7726 if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
7727 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
7728 size_t nsize = osize << 1;
7729
7730 if (nsize == 0) {
7731 ASSERT(osize == 0);
7732 ASSERT(dtrace_probes == NULL);
7733 nsize = sizeof (dtrace_probe_t *);
7734 }
7735
7736 probes = kmem_zalloc(nsize, KM_SLEEP);
7737
7738 if (dtrace_probes == NULL) {
7739 ASSERT(osize == 0);
7740 dtrace_probes = probes;
7741 dtrace_nprobes = 1;
7742 } else {
7743 dtrace_probe_t **oprobes = dtrace_probes;
7744
7745 bcopy(oprobes, probes, osize);
7746 dtrace_membar_producer();
7747 dtrace_probes = probes;
7748
7749 dtrace_sync();
7750
7751 /*
7752 * All CPUs are now seeing the new probes array; we can
7753 * safely free the old array.
7754 */
7755 kmem_free(oprobes, osize);
7756 dtrace_nprobes <<= 1;
7757 }
7758
7759 ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
7760 }
7761
7762 ASSERT(dtrace_probes[id - 1] == NULL);
7763 dtrace_probes[id - 1] = probe;
7764 provider->dtpv_probe_count++;
7765
7766 if (provider != dtrace_provider)
7767 lck_mtx_unlock(&dtrace_lock);
7768
7769 return (id);
7770 }
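
/*
 * For illustration (hypothetical names): a provider instrumenting a kernel
 * function would typically create an entry probe like so, retaining the
 * returned id to pass to dtrace_probe() when the site fires:
 *
 *	dtrace_id_t id = dtrace_probe_create(my_provider_id, "mach_kernel",
 *	    "some_function", "entry", 0, my_probe_arg);
 */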
7771
7772 static dtrace_probe_t *
7773 dtrace_probe_lookup_id(dtrace_id_t id)
7774 {
7775 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7776
7777 if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
7778 return (NULL);
7779
7780 return (dtrace_probes[id - 1]);
7781 }
7782
7783 static int
7784 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
7785 {
7786 *((dtrace_id_t *)arg) = probe->dtpr_id;
7787
7788 return (DTRACE_MATCH_DONE);
7789 }
7790
7791 /*
7792 * Look up a probe based on provider and one or more of module name, function
7793 * name and probe name.
7794 */
7795 dtrace_id_t
7796 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
7797 const char *func, const char *name)
7798 {
7799 dtrace_probekey_t pkey;
7800 dtrace_id_t id;
7801 int match;
7802
7803 pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
7804 pkey.dtpk_pmatch = &dtrace_match_string;
7805 pkey.dtpk_mod = mod;
7806 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
7807 pkey.dtpk_func = func;
7808 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
7809 pkey.dtpk_name = name;
7810 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
7811 pkey.dtpk_id = DTRACE_IDNONE;
7812
7813 lck_mtx_lock(&dtrace_lock);
7814 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
7815 dtrace_probe_lookup_match, &id);
7816 lck_mtx_unlock(&dtrace_lock);
7817
7818 ASSERT(match == 1 || match == 0);
7819 return (match ? id : 0);
7820 }
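
/*
 * Providers commonly use this to avoid creating a probe twice; a sketch with
 * hypothetical names (a return value of 0 means no matching probe exists):
 *
 *	if (dtrace_probe_lookup(my_provider_id, "mach_kernel",
 *	    "some_function", "entry") != 0)
 *		return;
 */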
7821
7822 /*
7823 * Returns the probe argument associated with the specified probe.
7824 */
7825 void *
7826 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
7827 {
7828 dtrace_probe_t *probe;
7829 void *rval = NULL;
7830
7831 lck_mtx_lock(&dtrace_lock);
7832
7833 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
7834 probe->dtpr_provider == (dtrace_provider_t *)id)
7835 rval = probe->dtpr_arg;
7836
7837 lck_mtx_unlock(&dtrace_lock);
7838
7839 return (rval);
7840 }
7841
7842 /*
7843 * Copy a probe into a probe description.
7844 */
7845 static void
7846 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
7847 {
7848 bzero(pdp, sizeof (dtrace_probedesc_t));
7849 pdp->dtpd_id = prp->dtpr_id;
7850
7851 /* APPLE NOTE: Darwin employs size bounded string operation. */
7852 (void) strlcpy(pdp->dtpd_provider,
7853 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
7854
7855 (void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
7856 (void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
7857 (void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
7858 }
7859
7860 /*
7861 * Called to indicate that a probe -- or probes -- should be provided by a
7862 * specified provider. If the specified description is NULL, the provider will
7863 * be told to provide all of its probes. (This is done whenever a new
7864 * consumer comes along, or whenever a retained enabling is to be matched.) If
7865 * the specified description is non-NULL, the provider is given the
7866 * opportunity to dynamically provide the specified probe, allowing providers
7867 * to support the creation of probes on-the-fly. (So-called _autocreated_
7868 * probes.) If the provider is NULL, the operations will be applied to all
7869 * providers; if the provider is non-NULL the operations will only be applied
7870 * to the specified provider. The dtrace_provider_lock must be held, and the
7871 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
7872 * will need to grab the dtrace_lock when it reenters the framework through
7873 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
7874 */
7875 static void
7876 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
7877 {
7878 struct modctl *ctl;
7879 int all = 0;
7880
7881 lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
7882
7883 if (prv == NULL) {
7884 all = 1;
7885 prv = dtrace_provider;
7886 }
7887
7888 do {
7889 /*
7890 * First, call the blanket provide operation.
7891 */
7892 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
7893
7894 /*
7895 * Now call the per-module provide operation. We will grab
7896 * mod_lock to prevent the list from being modified. Note
7897 * that this also prevents the mod_busy bits from changing.
7898 * (mod_busy can only be changed with mod_lock held.)
7899 */
7900 lck_mtx_lock(&mod_lock);
7901
7902 ctl = dtrace_modctl_list;
7903 while (ctl) {
7904 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
7905 ctl = ctl->mod_next;
7906 }
7907
7908 lck_mtx_unlock(&mod_lock);
7909 } while (all && (prv = prv->dtpv_next) != NULL);
7910 }
7911
7912 /*
7913 * Iterate over each probe, and call the Framework-to-Provider API function
7914 * denoted by offs.
7915 */
7916 static void
7917 dtrace_probe_foreach(uintptr_t offs)
7918 {
7919 dtrace_provider_t *prov;
7920 void (*func)(void *, dtrace_id_t, void *);
7921 dtrace_probe_t *probe;
7922 dtrace_icookie_t cookie;
7923 int i;
7924
7925 /*
7926 * We disable interrupts to walk through the probe array. This is
7927 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
7928 * won't see stale data.
7929 */
7930 cookie = dtrace_interrupt_disable();
7931
7932 for (i = 0; i < dtrace_nprobes; i++) {
7933 if ((probe = dtrace_probes[i]) == NULL)
7934 continue;
7935
7936 if (probe->dtpr_ecb == NULL) {
7937 /*
7938 * This probe isn't enabled -- don't call the function.
7939 */
7940 continue;
7941 }
7942
7943 prov = probe->dtpr_provider;
7944 func = *((void(**)(void *, dtrace_id_t, void *))
7945 ((uintptr_t)&prov->dtpv_pops + offs));
7946
7947 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
7948 }
7949
7950 dtrace_interrupt_enable(cookie);
7951 }
7952
7953 static int
7954 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
7955 {
7956 dtrace_probekey_t pkey;
7957 uint32_t priv;
7958 uid_t uid;
7959 zoneid_t zoneid;
7960
7961 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7962
7963 dtrace_ecb_create_cache = NULL;
7964
7965 if (desc == NULL) {
7966 /*
7967 * If we're passed a NULL description, we're being asked to
7968 * create an ECB with a NULL probe.
7969 */
7970 (void) dtrace_ecb_create_enable(NULL, enab);
7971 return (0);
7972 }
7973
7974 dtrace_probekey(desc, &pkey);
7975 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
7976 &priv, &uid, &zoneid);
7977
7978 return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
7979 enab));
7980 }
7981
7982 /*
7983 * DTrace Helper Provider Functions
7984 */
7985 static void
7986 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
7987 {
7988 attr->dtat_name = DOF_ATTR_NAME(dofattr);
7989 attr->dtat_data = DOF_ATTR_DATA(dofattr);
7990 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
7991 }
7992
7993 static void
7994 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
7995 const dof_provider_t *dofprov, char *strtab)
7996 {
7997 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
7998 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
7999 dofprov->dofpv_provattr);
8000 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
8001 dofprov->dofpv_modattr);
8002 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
8003 dofprov->dofpv_funcattr);
8004 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
8005 dofprov->dofpv_nameattr);
8006 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
8007 dofprov->dofpv_argsattr);
8008 }
8009
8010 static void
8011 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8012 {
8013 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8014 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8015 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
8016 dof_provider_t *provider;
8017 dof_probe_t *probe;
8018 uint32_t *off, *enoff;
8019 uint8_t *arg;
8020 char *strtab;
8021 uint_t i, nprobes;
8022 dtrace_helper_provdesc_t dhpv;
8023 dtrace_helper_probedesc_t dhpb;
8024 dtrace_meta_t *meta = dtrace_meta_pid;
8025 dtrace_mops_t *mops = &meta->dtm_mops;
8026 void *parg;
8027
8028 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8029 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8030 provider->dofpv_strtab * dof->dofh_secsize);
8031 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8032 provider->dofpv_probes * dof->dofh_secsize);
8033 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8034 provider->dofpv_prargs * dof->dofh_secsize);
8035 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8036 provider->dofpv_proffs * dof->dofh_secsize);
8037
8038 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8039 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
8040 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
8041 enoff = NULL;
8042
8043 /*
8044 * See dtrace_helper_provider_validate().
8045 */
8046 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
8047 provider->dofpv_prenoffs != DOF_SECT_NONE) {
8048 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8049 provider->dofpv_prenoffs * dof->dofh_secsize);
8050 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
8051 }
8052
8053 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
8054
8055 /*
8056 * Create the provider.
8057 */
8058 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8059
8060 if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
8061 return;
8062
8063 meta->dtm_count++;
8064
8065 /*
8066 * Create the probes.
8067 */
8068 for (i = 0; i < nprobes; i++) {
8069 probe = (dof_probe_t *)(uintptr_t)(daddr +
8070 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
8071
8072 dhpb.dthpb_mod = dhp->dofhp_mod;
8073 dhpb.dthpb_func = strtab + probe->dofpr_func;
8074 dhpb.dthpb_name = strtab + probe->dofpr_name;
8075 #if !defined(__APPLE__)
8076 dhpb.dthpb_base = probe->dofpr_addr;
8077 #else
8078 dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
8079 #endif
8080 dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
8081 dhpb.dthpb_noffs = probe->dofpr_noffs;
8082 if (enoff != NULL) {
8083 dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
8084 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
8085 } else {
8086 dhpb.dthpb_enoffs = NULL;
8087 dhpb.dthpb_nenoffs = 0;
8088 }
8089 dhpb.dthpb_args = arg + probe->dofpr_argidx;
8090 dhpb.dthpb_nargc = probe->dofpr_nargc;
8091 dhpb.dthpb_xargc = probe->dofpr_xargc;
8092 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
8093 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
8094
8095 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
8096 }
8097 }
8098
8099 static void
8100 dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
8101 {
8102 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8103 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8104 uint32_t i;
8105
8106 lck_mtx_assert(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8107
8108 for (i = 0; i < dof->dofh_secnum; i++) {
8109 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8110 dof->dofh_secoff + i * dof->dofh_secsize);
8111
8112 if (sec->dofs_type != DOF_SECT_PROVIDER)
8113 continue;
8114
8115 dtrace_helper_provide_one(dhp, sec, pid);
8116 }
8117
8118 /*
8119 * We may have just created probes, so we must now rematch against
8120 * any retained enablings. Note that this call will acquire both
8121 * cpu_lock and dtrace_lock; the fact that we are holding
8122 * dtrace_meta_lock now is what defines the ordering with respect to
8123 * these three locks.
8124 */
8125 dtrace_enabling_matchall();
8126 }
8127
8128 static void
8129 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8130 {
8131 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8132 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8133 dof_sec_t *str_sec;
8134 dof_provider_t *provider;
8135 char *strtab;
8136 dtrace_helper_provdesc_t dhpv;
8137 dtrace_meta_t *meta = dtrace_meta_pid;
8138 dtrace_mops_t *mops = &meta->dtm_mops;
8139
8140 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8141 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8142 provider->dofpv_strtab * dof->dofh_secsize);
8143
8144 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8145
8146 /*
8147 * Create the provider.
8148 */
8149 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8150
8151 mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
8152
8153 meta->dtm_count--;
8154 }
8155
8156 static void
8157 dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
8158 {
8159 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8160 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8161 uint32_t i;
8162
8163 lck_mtx_assert(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8164
8165 for (i = 0; i < dof->dofh_secnum; i++) {
8166 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8167 dof->dofh_secoff + i * dof->dofh_secsize);
8168
8169 if (sec->dofs_type != DOF_SECT_PROVIDER)
8170 continue;
8171
8172 dtrace_helper_provider_remove_one(dhp, sec, pid);
8173 }
8174 }
8175
8176 /*
8177 * DTrace Meta Provider-to-Framework API Functions
8178 *
8179 * These functions implement the Meta Provider-to-Framework API, as described
8180 * in <sys/dtrace.h>.
8181 */
8182 int
8183 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8184 dtrace_meta_provider_id_t *idp)
8185 {
8186 dtrace_meta_t *meta;
8187 dtrace_helpers_t *help, *next;
8188 uint_t i;
8189
8190 *idp = DTRACE_METAPROVNONE;
8191
8192 /*
8193 * We strictly don't need the name, but we hold onto it for
8194 * debuggability. All hail error queues!
8195 */
8196 if (name == NULL) {
8197 cmn_err(CE_WARN, "failed to register meta-provider: "
8198 "invalid name");
8199 return (EINVAL);
8200 }
8201
8202 if (mops == NULL ||
8203 mops->dtms_create_probe == NULL ||
8204 mops->dtms_provide_pid == NULL ||
8205 mops->dtms_remove_pid == NULL) {
8206 cmn_err(CE_WARN, "failed to register meta-register %s: "
8207 "invalid ops", name);
8208 return (EINVAL);
8209 }
8210
8211 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8212 meta->dtm_mops = *mops;
8213
8214 /* APPLE NOTE: Darwin employs size bounded string operation. */
8215 {
8216 size_t bufsize = strlen(name) + 1;
8217 meta->dtm_name = kmem_alloc(bufsize, KM_SLEEP);
8218 (void) strlcpy(meta->dtm_name, name, bufsize);
8219 }
8220
8221 meta->dtm_arg = arg;
8222
8223 lck_mtx_lock(&dtrace_meta_lock);
8224 lck_mtx_lock(&dtrace_lock);
8225
8226 if (dtrace_meta_pid != NULL) {
8227 lck_mtx_unlock(&dtrace_lock);
8228 lck_mtx_unlock(&dtrace_meta_lock);
8229 cmn_err(CE_WARN, "failed to register meta-provider %s: "
8230 "user-land meta-provider exists", name);
8231 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
8232 kmem_free(meta, sizeof (dtrace_meta_t));
8233 return (EINVAL);
8234 }
8235
8236 dtrace_meta_pid = meta;
8237 *idp = (dtrace_meta_provider_id_t)meta;
8238
8239 /*
8240 * If there are providers and probes ready to go, pass them
8241 * off to the new meta provider now.
8242 */
8243
8244 help = dtrace_deferred_pid;
8245 dtrace_deferred_pid = NULL;
8246
8247 lck_mtx_unlock(&dtrace_lock);
8248
8249 while (help != NULL) {
8250 for (i = 0; i < help->dthps_nprovs; i++) {
8251 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8252 help->dthps_pid);
8253 }
8254
8255 next = help->dthps_next;
8256 help->dthps_next = NULL;
8257 help->dthps_prev = NULL;
8258 help->dthps_deferred = 0;
8259 help = next;
8260 }
8261
8262 lck_mtx_unlock(&dtrace_meta_lock);
8263
8264 return (0);
8265 }
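/*
 * Editorial note: a hedged sketch of the expected call from a user-land
 * meta-provider; the names are hypothetical. The ops vector must supply at
 * least dtms_create_probe, dtms_provide_pid and dtms_remove_pid, or the
 * registration fails with EINVAL as checked above.
 *
 *	static dtrace_mops_t example_mops = {
 *		.dtms_create_probe = example_create_probe,
 *		.dtms_provide_pid = example_provide_pid,
 *		.dtms_remove_pid = example_remove_pid
 *	};
 *	static dtrace_meta_provider_id_t example_meta_id;
 *
 *	int error = dtrace_meta_register("example", &example_mops,
 *	    NULL, &example_meta_id);
 *
 * Only one user-land meta-provider may be registered at a time; a second
 * registration returns EINVAL, and dtrace_meta_unregister() returns EBUSY
 * until the meta-provider's dtm_count drops to zero.
 */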
8266
8267 int
8268 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
8269 {
8270 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
8271
8272 lck_mtx_lock(&dtrace_meta_lock);
8273 lck_mtx_lock(&dtrace_lock);
8274
8275 if (old == dtrace_meta_pid) {
8276 pp = &dtrace_meta_pid;
8277 } else {
8278 panic("attempt to unregister non-existent "
8279 "dtrace meta-provider %p\n", (void *)old);
8280 }
8281
8282 if (old->dtm_count != 0) {
8283 lck_mtx_unlock(&dtrace_lock);
8284 lck_mtx_unlock(&dtrace_meta_lock);
8285 return (EBUSY);
8286 }
8287
8288 *pp = NULL;
8289
8290 lck_mtx_unlock(&dtrace_lock);
8291 lck_mtx_unlock(&dtrace_meta_lock);
8292
8293 kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
8294 kmem_free(old, sizeof (dtrace_meta_t));
8295
8296 return (0);
8297 }
8298
8299
8300 /*
8301 * DTrace DIF Object Functions
8302 */
8303 static int
8304 dtrace_difo_err(uint_t pc, const char *format, ...)
8305 {
8306 if (dtrace_err_verbose) {
8307 va_list alist;
8308
8309 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
8310 va_start(alist, format);
8311 (void) vuprintf(format, alist);
8312 va_end(alist);
8313 }
8314
8315 #ifdef DTRACE_ERRDEBUG
8316 dtrace_errdebug(format);
8317 #endif
8318 return (1);
8319 }
8320
8321 /*
8322 * Validate a DTrace DIF object by checking the IR instructions. The following
8323 * rules are currently enforced by dtrace_difo_validate():
8324 *
8325 * 1. Each instruction must have a valid opcode
8326 * 2. Each register, string, variable, or subroutine reference must be valid
8327 * 3. No instruction can modify register %r0 (must be zero)
8328 * 4. All instruction reserved bits must be set to zero
8329 * 5. The last instruction must be a "ret" instruction
8330 * 6. All branch targets must reference a valid instruction _after_ the branch
8331 */
8332 static int
8333 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8334 cred_t *cr)
8335 {
8336 int err = 0;
8337 uint_t i;
8338
8339 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8340 int kcheckload;
8341 uint_t pc;
8342
8343 kcheckload = cr == NULL ||
8344 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8345
8346 dp->dtdo_destructive = 0;
8347
8348 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8349 dif_instr_t instr = dp->dtdo_buf[pc];
8350
8351 uint_t r1 = DIF_INSTR_R1(instr);
8352 uint_t r2 = DIF_INSTR_R2(instr);
8353 uint_t rd = DIF_INSTR_RD(instr);
8354 uint_t rs = DIF_INSTR_RS(instr);
8355 uint_t label = DIF_INSTR_LABEL(instr);
8356 uint_t v = DIF_INSTR_VAR(instr);
8357 uint_t subr = DIF_INSTR_SUBR(instr);
8358 uint_t type = DIF_INSTR_TYPE(instr);
8359 uint_t op = DIF_INSTR_OP(instr);
8360
8361 switch (op) {
8362 case DIF_OP_OR:
8363 case DIF_OP_XOR:
8364 case DIF_OP_AND:
8365 case DIF_OP_SLL:
8366 case DIF_OP_SRL:
8367 case DIF_OP_SRA:
8368 case DIF_OP_SUB:
8369 case DIF_OP_ADD:
8370 case DIF_OP_MUL:
8371 case DIF_OP_SDIV:
8372 case DIF_OP_UDIV:
8373 case DIF_OP_SREM:
8374 case DIF_OP_UREM:
8375 case DIF_OP_COPYS:
8376 if (r1 >= nregs)
8377 err += efunc(pc, "invalid register %u\n", r1);
8378 if (r2 >= nregs)
8379 err += efunc(pc, "invalid register %u\n", r2);
8380 if (rd >= nregs)
8381 err += efunc(pc, "invalid register %u\n", rd);
8382 if (rd == 0)
8383 err += efunc(pc, "cannot write to %r0\n");
8384 break;
8385 case DIF_OP_NOT:
8386 case DIF_OP_MOV:
8387 case DIF_OP_ALLOCS:
8388 if (r1 >= nregs)
8389 err += efunc(pc, "invalid register %u\n", r1);
8390 if (r2 != 0)
8391 err += efunc(pc, "non-zero reserved bits\n");
8392 if (rd >= nregs)
8393 err += efunc(pc, "invalid register %u\n", rd);
8394 if (rd == 0)
8395 err += efunc(pc, "cannot write to %r0\n");
8396 break;
8397 case DIF_OP_LDSB:
8398 case DIF_OP_LDSH:
8399 case DIF_OP_LDSW:
8400 case DIF_OP_LDUB:
8401 case DIF_OP_LDUH:
8402 case DIF_OP_LDUW:
8403 case DIF_OP_LDX:
8404 if (r1 >= nregs)
8405 err += efunc(pc, "invalid register %u\n", r1);
8406 if (r2 != 0)
8407 err += efunc(pc, "non-zero reserved bits\n");
8408 if (rd >= nregs)
8409 err += efunc(pc, "invalid register %u\n", rd);
8410 if (rd == 0)
8411 err += efunc(pc, "cannot write to %r0\n");
8412 if (kcheckload)
8413 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
8414 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
8415 break;
8416 case DIF_OP_RLDSB:
8417 case DIF_OP_RLDSH:
8418 case DIF_OP_RLDSW:
8419 case DIF_OP_RLDUB:
8420 case DIF_OP_RLDUH:
8421 case DIF_OP_RLDUW:
8422 case DIF_OP_RLDX:
8423 if (r1 >= nregs)
8424 err += efunc(pc, "invalid register %u\n", r1);
8425 if (r2 != 0)
8426 err += efunc(pc, "non-zero reserved bits\n");
8427 if (rd >= nregs)
8428 err += efunc(pc, "invalid register %u\n", rd);
8429 if (rd == 0)
8430 err += efunc(pc, "cannot write to %r0\n");
8431 break;
8432 case DIF_OP_ULDSB:
8433 case DIF_OP_ULDSH:
8434 case DIF_OP_ULDSW:
8435 case DIF_OP_ULDUB:
8436 case DIF_OP_ULDUH:
8437 case DIF_OP_ULDUW:
8438 case DIF_OP_ULDX:
8439 if (r1 >= nregs)
8440 err += efunc(pc, "invalid register %u\n", r1);
8441 if (r2 != 0)
8442 err += efunc(pc, "non-zero reserved bits\n");
8443 if (rd >= nregs)
8444 err += efunc(pc, "invalid register %u\n", rd);
8445 if (rd == 0)
8446 err += efunc(pc, "cannot write to %r0\n");
8447 break;
8448 case DIF_OP_STB:
8449 case DIF_OP_STH:
8450 case DIF_OP_STW:
8451 case DIF_OP_STX:
8452 if (r1 >= nregs)
8453 err += efunc(pc, "invalid register %u\n", r1);
8454 if (r2 != 0)
8455 err += efunc(pc, "non-zero reserved bits\n");
8456 if (rd >= nregs)
8457 err += efunc(pc, "invalid register %u\n", rd);
8458 if (rd == 0)
8459 err += efunc(pc, "cannot write to 0 address\n");
8460 break;
8461 case DIF_OP_CMP:
8462 case DIF_OP_SCMP:
8463 if (r1 >= nregs)
8464 err += efunc(pc, "invalid register %u\n", r1);
8465 if (r2 >= nregs)
8466 err += efunc(pc, "invalid register %u\n", r2);
8467 if (rd != 0)
8468 err += efunc(pc, "non-zero reserved bits\n");
8469 break;
8470 case DIF_OP_TST:
8471 if (r1 >= nregs)
8472 err += efunc(pc, "invalid register %u\n", r1);
8473 if (r2 != 0 || rd != 0)
8474 err += efunc(pc, "non-zero reserved bits\n");
8475 break;
8476 case DIF_OP_BA:
8477 case DIF_OP_BE:
8478 case DIF_OP_BNE:
8479 case DIF_OP_BG:
8480 case DIF_OP_BGU:
8481 case DIF_OP_BGE:
8482 case DIF_OP_BGEU:
8483 case DIF_OP_BL:
8484 case DIF_OP_BLU:
8485 case DIF_OP_BLE:
8486 case DIF_OP_BLEU:
8487 if (label >= dp->dtdo_len) {
8488 err += efunc(pc, "invalid branch target %u\n",
8489 label);
8490 }
8491 if (label <= pc) {
8492 err += efunc(pc, "backward branch to %u\n",
8493 label);
8494 }
8495 break;
8496 case DIF_OP_RET:
8497 if (r1 != 0 || r2 != 0)
8498 err += efunc(pc, "non-zero reserved bits\n");
8499 if (rd >= nregs)
8500 err += efunc(pc, "invalid register %u\n", rd);
8501 break;
8502 case DIF_OP_NOP:
8503 case DIF_OP_POPTS:
8504 case DIF_OP_FLUSHTS:
8505 if (r1 != 0 || r2 != 0 || rd != 0)
8506 err += efunc(pc, "non-zero reserved bits\n");
8507 break;
8508 case DIF_OP_SETX:
8509 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
8510 err += efunc(pc, "invalid integer ref %u\n",
8511 DIF_INSTR_INTEGER(instr));
8512 }
8513 if (rd >= nregs)
8514 err += efunc(pc, "invalid register %u\n", rd);
8515 if (rd == 0)
8516 err += efunc(pc, "cannot write to %r0\n");
8517 break;
8518 case DIF_OP_SETS:
8519 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
8520 err += efunc(pc, "invalid string ref %u\n",
8521 DIF_INSTR_STRING(instr));
8522 }
8523 if (rd >= nregs)
8524 err += efunc(pc, "invalid register %u\n", rd);
8525 if (rd == 0)
8526 err += efunc(pc, "cannot write to %r0\n");
8527 break;
8528 case DIF_OP_LDGA:
8529 case DIF_OP_LDTA:
8530 if (r1 > DIF_VAR_ARRAY_MAX)
8531 err += efunc(pc, "invalid array %u\n", r1);
8532 if (r2 >= nregs)
8533 err += efunc(pc, "invalid register %u\n", r2);
8534 if (rd >= nregs)
8535 err += efunc(pc, "invalid register %u\n", rd);
8536 if (rd == 0)
8537 err += efunc(pc, "cannot write to %r0\n");
8538 break;
8539 case DIF_OP_LDGS:
8540 case DIF_OP_LDTS:
8541 case DIF_OP_LDLS:
8542 case DIF_OP_LDGAA:
8543 case DIF_OP_LDTAA:
8544 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
8545 err += efunc(pc, "invalid variable %u\n", v);
8546 if (rd >= nregs)
8547 err += efunc(pc, "invalid register %u\n", rd);
8548 if (rd == 0)
8549 err += efunc(pc, "cannot write to %r0\n");
8550 break;
8551 case DIF_OP_STGS:
8552 case DIF_OP_STTS:
8553 case DIF_OP_STLS:
8554 case DIF_OP_STGAA:
8555 case DIF_OP_STTAA:
8556 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
8557 err += efunc(pc, "invalid variable %u\n", v);
8558 if (rs >= nregs)
8559 err += efunc(pc, "invalid register %u\n", rd);
8560 break;
8561 case DIF_OP_CALL:
8562 if (subr > DIF_SUBR_MAX)
8563 err += efunc(pc, "invalid subr %u\n", subr);
8564 if (rd >= nregs)
8565 err += efunc(pc, "invalid register %u\n", rd);
8566 if (rd == 0)
8567 err += efunc(pc, "cannot write to %r0\n");
8568
8569 if (subr == DIF_SUBR_COPYOUT ||
8570 subr == DIF_SUBR_COPYOUTSTR) {
8571 dp->dtdo_destructive = 1;
8572 }
8573 break;
8574 case DIF_OP_PUSHTR:
8575 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
8576 err += efunc(pc, "invalid ref type %u\n", type);
8577 if (r2 >= nregs)
8578 err += efunc(pc, "invalid register %u\n", r2);
8579 if (rs >= nregs)
8580 err += efunc(pc, "invalid register %u\n", rs);
8581 break;
8582 case DIF_OP_PUSHTV:
8583 if (type != DIF_TYPE_CTF)
8584 err += efunc(pc, "invalid val type %u\n", type);
8585 if (r2 >= nregs)
8586 err += efunc(pc, "invalid register %u\n", r2);
8587 if (rs >= nregs)
8588 err += efunc(pc, "invalid register %u\n", rs);
8589 break;
8590 default:
8591 err += efunc(pc, "invalid opcode %u\n",
8592 DIF_INSTR_OP(instr));
8593 }
8594 }
8595
8596 if (dp->dtdo_len != 0 &&
8597 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
8598 err += efunc(dp->dtdo_len - 1,
8599 "expected 'ret' as last DIF instruction\n");
8600 }
8601
8602 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
8603 /*
8604 * If we're not returning by reference, the size must be either
8605 * 0 or the size of one of the base types.
8606 */
8607 switch (dp->dtdo_rtype.dtdt_size) {
8608 case 0:
8609 case sizeof (uint8_t):
8610 case sizeof (uint16_t):
8611 case sizeof (uint32_t):
8612 case sizeof (uint64_t):
8613 break;
8614
8615 default:
8616 err += efunc(dp->dtdo_len - 1, "bad return size\n");
8617 }
8618 }
8619
8620 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
8621 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
8622 dtrace_diftype_t *vt, *et;
8623 uint_t id;
8624 int ndx;
8625
8626 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
8627 v->dtdv_scope != DIFV_SCOPE_THREAD &&
8628 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
8629 err += efunc(i, "unrecognized variable scope %d\n",
8630 v->dtdv_scope);
8631 break;
8632 }
8633
8634 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
8635 v->dtdv_kind != DIFV_KIND_SCALAR) {
8636 err += efunc(i, "unrecognized variable type %d\n",
8637 v->dtdv_kind);
8638 break;
8639 }
8640
8641 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
8642 err += efunc(i, "%d exceeds variable id limit\n", id);
8643 break;
8644 }
8645
8646 if (id < DIF_VAR_OTHER_UBASE)
8647 continue;
8648
8649 /*
8650 * For user-defined variables, we need to check that this
8651 * definition is identical to any previous definition that we
8652 * encountered.
8653 */
8654 ndx = id - DIF_VAR_OTHER_UBASE;
8655
8656 switch (v->dtdv_scope) {
8657 case DIFV_SCOPE_GLOBAL:
8658 if (ndx < vstate->dtvs_nglobals) {
8659 dtrace_statvar_t *svar;
8660
8661 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
8662 existing = &svar->dtsv_var;
8663 }
8664
8665 break;
8666
8667 case DIFV_SCOPE_THREAD:
8668 if (ndx < vstate->dtvs_ntlocals)
8669 existing = &vstate->dtvs_tlocals[ndx];
8670 break;
8671
8672 case DIFV_SCOPE_LOCAL:
8673 if (ndx < vstate->dtvs_nlocals) {
8674 dtrace_statvar_t *svar;
8675
8676 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
8677 existing = &svar->dtsv_var;
8678 }
8679
8680 break;
8681 }
8682
8683 vt = &v->dtdv_type;
8684
8685 if (vt->dtdt_flags & DIF_TF_BYREF) {
8686 if (vt->dtdt_size == 0) {
8687 err += efunc(i, "zero-sized variable\n");
8688 break;
8689 }
8690
8691 if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
8692 vt->dtdt_size > dtrace_global_maxsize) {
8693 err += efunc(i, "oversized by-ref global\n");
8694 break;
8695 }
8696 }
8697
8698 if (existing == NULL || existing->dtdv_id == 0)
8699 continue;
8700
8701 ASSERT(existing->dtdv_id == v->dtdv_id);
8702 ASSERT(existing->dtdv_scope == v->dtdv_scope);
8703
8704 if (existing->dtdv_kind != v->dtdv_kind)
8705 err += efunc(i, "%d changed variable kind\n", id);
8706
8707 et = &existing->dtdv_type;
8708
8709 if (vt->dtdt_flags != et->dtdt_flags) {
8710 err += efunc(i, "%d changed variable type flags\n", id);
8711 break;
8712 }
8713
8714 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
8715 err += efunc(i, "%d changed variable type size\n", id);
8716 break;
8717 }
8718 }
8719
8720 return (err);
8721 }
8722
8723 /*
8724 * Validate a DTrace DIF object that is to be used as a helper. Helpers
8725 * are much more constrained than normal DIFOs. Specifically, they may
8726 * not:
8727 *
8728 * 1. Make calls to subroutines other than copyin(), copyinstr() or
8729 * miscellaneous string routines.
8730 * 2. Access DTrace variables other than the args[] array, and the
8731 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
8732 * 3. Have thread-local variables.
8733 * 4. Have dynamic variables.
8734 */
8735 static int
8736 dtrace_difo_validate_helper(dtrace_difo_t *dp)
8737 {
8738 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8739 int err = 0;
8740 uint_t pc;
8741
8742 for (pc = 0; pc < dp->dtdo_len; pc++) {
8743 dif_instr_t instr = dp->dtdo_buf[pc];
8744
8745 uint_t v = DIF_INSTR_VAR(instr);
8746 uint_t subr = DIF_INSTR_SUBR(instr);
8747 uint_t op = DIF_INSTR_OP(instr);
8748
8749 switch (op) {
8750 case DIF_OP_OR:
8751 case DIF_OP_XOR:
8752 case DIF_OP_AND:
8753 case DIF_OP_SLL:
8754 case DIF_OP_SRL:
8755 case DIF_OP_SRA:
8756 case DIF_OP_SUB:
8757 case DIF_OP_ADD:
8758 case DIF_OP_MUL:
8759 case DIF_OP_SDIV:
8760 case DIF_OP_UDIV:
8761 case DIF_OP_SREM:
8762 case DIF_OP_UREM:
8763 case DIF_OP_COPYS:
8764 case DIF_OP_NOT:
8765 case DIF_OP_MOV:
8766 case DIF_OP_RLDSB:
8767 case DIF_OP_RLDSH:
8768 case DIF_OP_RLDSW:
8769 case DIF_OP_RLDUB:
8770 case DIF_OP_RLDUH:
8771 case DIF_OP_RLDUW:
8772 case DIF_OP_RLDX:
8773 case DIF_OP_ULDSB:
8774 case DIF_OP_ULDSH:
8775 case DIF_OP_ULDSW:
8776 case DIF_OP_ULDUB:
8777 case DIF_OP_ULDUH:
8778 case DIF_OP_ULDUW:
8779 case DIF_OP_ULDX:
8780 case DIF_OP_STB:
8781 case DIF_OP_STH:
8782 case DIF_OP_STW:
8783 case DIF_OP_STX:
8784 case DIF_OP_ALLOCS:
8785 case DIF_OP_CMP:
8786 case DIF_OP_SCMP:
8787 case DIF_OP_TST:
8788 case DIF_OP_BA:
8789 case DIF_OP_BE:
8790 case DIF_OP_BNE:
8791 case DIF_OP_BG:
8792 case DIF_OP_BGU:
8793 case DIF_OP_BGE:
8794 case DIF_OP_BGEU:
8795 case DIF_OP_BL:
8796 case DIF_OP_BLU:
8797 case DIF_OP_BLE:
8798 case DIF_OP_BLEU:
8799 case DIF_OP_RET:
8800 case DIF_OP_NOP:
8801 case DIF_OP_POPTS:
8802 case DIF_OP_FLUSHTS:
8803 case DIF_OP_SETX:
8804 case DIF_OP_SETS:
8805 case DIF_OP_LDGA:
8806 case DIF_OP_LDLS:
8807 case DIF_OP_STGS:
8808 case DIF_OP_STLS:
8809 case DIF_OP_PUSHTR:
8810 case DIF_OP_PUSHTV:
8811 break;
8812
8813 case DIF_OP_LDGS:
8814 if (v >= DIF_VAR_OTHER_UBASE)
8815 break;
8816
8817 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
8818 break;
8819
8820 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
8821 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
8822 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
8823 v == DIF_VAR_UID || v == DIF_VAR_GID)
8824 break;
8825
8826 err += efunc(pc, "illegal variable %u\n", v);
8827 break;
8828
8829 case DIF_OP_LDTA:
8830 case DIF_OP_LDTS:
8831 case DIF_OP_LDGAA:
8832 case DIF_OP_LDTAA:
8833 err += efunc(pc, "illegal dynamic variable load\n");
8834 break;
8835
8836 case DIF_OP_STTS:
8837 case DIF_OP_STGAA:
8838 case DIF_OP_STTAA:
8839 err += efunc(pc, "illegal dynamic variable store\n");
8840 break;
8841
8842 case DIF_OP_CALL:
8843 if (subr == DIF_SUBR_ALLOCA ||
8844 subr == DIF_SUBR_BCOPY ||
8845 subr == DIF_SUBR_COPYIN ||
8846 subr == DIF_SUBR_COPYINTO ||
8847 subr == DIF_SUBR_COPYINSTR ||
8848 subr == DIF_SUBR_INDEX ||
8849 subr == DIF_SUBR_INET_NTOA ||
8850 subr == DIF_SUBR_INET_NTOA6 ||
8851 subr == DIF_SUBR_INET_NTOP ||
8852 subr == DIF_SUBR_LLTOSTR ||
8853 subr == DIF_SUBR_RINDEX ||
8854 subr == DIF_SUBR_STRCHR ||
8855 subr == DIF_SUBR_STRJOIN ||
8856 subr == DIF_SUBR_STRRCHR ||
8857 subr == DIF_SUBR_STRSTR ||
8858 subr == DIF_SUBR_COREPROFILE ||
8859 subr == DIF_SUBR_HTONS ||
8860 subr == DIF_SUBR_HTONL ||
8861 subr == DIF_SUBR_HTONLL ||
8862 subr == DIF_SUBR_NTOHS ||
8863 subr == DIF_SUBR_NTOHL ||
8864 subr == DIF_SUBR_NTOHLL)
8865 break;
8866
8867 err += efunc(pc, "invalid subr %u\n", subr);
8868 break;
8869
8870 default:
8871 err += efunc(pc, "invalid opcode %u\n",
8872 DIF_INSTR_OP(instr));
8873 }
8874 }
8875
8876 return (err);
8877 }
8878
8879 /*
8880 * Returns 1 if the expression in the DIF object can be cached on a per-thread
8881 * basis; 0 if not.
8882 */
8883 static int
8884 dtrace_difo_cacheable(dtrace_difo_t *dp)
8885 {
8886 uint_t i;
8887
8888 if (dp == NULL)
8889 return (0);
8890
8891 for (i = 0; i < dp->dtdo_varlen; i++) {
8892 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8893
8894 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
8895 continue;
8896
8897 switch (v->dtdv_id) {
8898 case DIF_VAR_CURTHREAD:
8899 case DIF_VAR_PID:
8900 case DIF_VAR_TID:
8901 case DIF_VAR_EXECNAME:
8902 case DIF_VAR_ZONENAME:
8903 break;
8904
8905 default:
8906 return (0);
8907 }
8908 }
8909
8910 /*
8911 * This DIF object may be cacheable. Now we need to look for any
8912 * array loading instructions, any memory loading instructions, or
8913 * any stores to thread-local variables.
8914 */
8915 for (i = 0; i < dp->dtdo_len; i++) {
8916 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
8917
8918 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
8919 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
8920 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
8921 op == DIF_OP_LDGA || op == DIF_OP_STTS)
8922 return (0);
8923 }
8924
8925 return (1);
8926 }
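/*
 * Editorial note (hedged example, not part of the original source): a
 * predicate such as
 *
 *	/ execname == "launchd" /
 *
 * references only the thread-invariant variables enumerated above and
 * contains no loads, so its result may be cached per-thread. A predicate
 * such as / arg0 == 0 /, or one that dereferences memory, can evaluate
 * differently on every firing, so dtrace_difo_cacheable() returns 0 for it.
 */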
8927
8928 static void
8929 dtrace_difo_hold(dtrace_difo_t *dp)
8930 {
8931 uint_t i;
8932
8933 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8934
8935 dp->dtdo_refcnt++;
8936 ASSERT(dp->dtdo_refcnt != 0);
8937
8938 /*
8939 * We need to check this DIF object for references to the variable
8940 * DIF_VAR_VTIMESTAMP.
8941 */
8942 for (i = 0; i < dp->dtdo_varlen; i++) {
8943 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8944
8945 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
8946 continue;
8947
8948 if (dtrace_vtime_references++ == 0)
8949 dtrace_vtime_enable();
8950 }
8951 }
8952
8953 /*
8954 * This routine calculates the dynamic variable chunksize for a given DIF
8955 * object. The calculation is not fool-proof, and can probably be tricked by
8956 * malicious DIF -- but it works for all compiler-generated DIF. Because this
8957 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
8958 * if a dynamic variable size exceeds the chunksize.
8959 */
8960 static void
8961 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8962 {
8963 uint64_t sval = 0;
8964 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
8965 const dif_instr_t *text = dp->dtdo_buf;
8966 uint_t pc, srd = 0;
8967 uint_t ttop = 0;
8968 size_t size, ksize;
8969 uint_t id, i;
8970
8971 for (pc = 0; pc < dp->dtdo_len; pc++) {
8972 dif_instr_t instr = text[pc];
8973 uint_t op = DIF_INSTR_OP(instr);
8974 uint_t rd = DIF_INSTR_RD(instr);
8975 uint_t r1 = DIF_INSTR_R1(instr);
8976 uint_t nkeys = 0;
8977 uchar_t scope;
8978
8979 dtrace_key_t *key = tupregs;
8980
8981 switch (op) {
8982 case DIF_OP_SETX:
8983 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
8984 srd = rd;
8985 continue;
8986
8987 case DIF_OP_STTS:
8988 key = &tupregs[DIF_DTR_NREGS];
8989 key[0].dttk_size = 0;
8990 key[1].dttk_size = 0;
8991 nkeys = 2;
8992 scope = DIFV_SCOPE_THREAD;
8993 break;
8994
8995 case DIF_OP_STGAA:
8996 case DIF_OP_STTAA:
8997 nkeys = ttop;
8998
8999 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
9000 key[nkeys++].dttk_size = 0;
9001
9002 key[nkeys++].dttk_size = 0;
9003
9004 if (op == DIF_OP_STTAA) {
9005 scope = DIFV_SCOPE_THREAD;
9006 } else {
9007 scope = DIFV_SCOPE_GLOBAL;
9008 }
9009
9010 break;
9011
9012 case DIF_OP_PUSHTR:
9013 if (ttop == DIF_DTR_NREGS)
9014 return;
9015
9016 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
9017 /*
9018 * If the register for the size of the "pushtr"
9019 * is %r0 (or the value is 0) and the type is
9020 * a string, we'll use the system-wide default
9021 * string size.
9022 */
9023 tupregs[ttop++].dttk_size =
9024 dtrace_strsize_default;
9025 } else {
9026 if (srd == 0)
9027 return;
9028
9029 tupregs[ttop++].dttk_size = sval;
9030 }
9031
9032 break;
9033
9034 case DIF_OP_PUSHTV:
9035 if (ttop == DIF_DTR_NREGS)
9036 return;
9037
9038 tupregs[ttop++].dttk_size = 0;
9039 break;
9040
9041 case DIF_OP_FLUSHTS:
9042 ttop = 0;
9043 break;
9044
9045 case DIF_OP_POPTS:
9046 if (ttop != 0)
9047 ttop--;
9048 break;
9049 }
9050
9051 sval = 0;
9052 srd = 0;
9053
9054 if (nkeys == 0)
9055 continue;
9056
9057 /*
9058 * We have a dynamic variable allocation; calculate its size.
9059 */
9060 for (ksize = 0, i = 0; i < nkeys; i++)
9061 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
9062
9063 size = sizeof (dtrace_dynvar_t);
9064 size += sizeof (dtrace_key_t) * (nkeys - 1);
9065 size += ksize;
9066
9067 /*
9068 * Now we need to determine the size of the stored data.
9069 */
9070 id = DIF_INSTR_VAR(instr);
9071
9072 for (i = 0; i < dp->dtdo_varlen; i++) {
9073 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9074
9075 if (v->dtdv_id == id && v->dtdv_scope == scope) {
9076 size += v->dtdv_type.dtdt_size;
9077 break;
9078 }
9079 }
9080
9081 if (i == dp->dtdo_varlen)
9082 return;
9083
9084 /*
9085 * We have the size. If this is larger than the chunk size
9086 * for our dynamic variable state, reset the chunk size.
9087 */
9088 size = P2ROUNDUP(size, sizeof (uint64_t));
9089
9090 if (size > vstate->dtvs_dynvars.dtds_chunksize)
9091 vstate->dtvs_dynvars.dtds_chunksize = size;
9092 }
9093 }
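/*
 * Editorial note: a hedged worked example of the calculation above. For a
 * thread-local store such as
 *
 *	self->x = timestamp;
 *
 * the stts path contributes two zero-sized keys (the thread pointer and the
 * variable id are supplied at probe time), so the allocation is
 *
 *	size = sizeof (dtrace_dynvar_t) + 1 * sizeof (dtrace_key_t)
 *	    + 0 (key data) + dtdv_type.dtdt_size of self->x,
 *
 * rounded up to a multiple of sizeof (uint64_t); the chunksize becomes the
 * largest such value over all dynamic-variable sites in the enabling.
 */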
9094
9095 static void
9096 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9097 {
9098 int oldsvars, osz, nsz, otlocals, ntlocals;
9099 uint_t i, id;
9100
9101 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9102 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
9103
9104 for (i = 0; i < dp->dtdo_varlen; i++) {
9105 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9106 dtrace_statvar_t *svar;
9107 dtrace_statvar_t ***svarp = NULL;
9108 size_t dsize = 0;
9109 uint8_t scope = v->dtdv_scope;
9110 int *np = (int *)NULL;
9111
9112 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9113 continue;
9114
9115 id -= DIF_VAR_OTHER_UBASE;
9116
9117 switch (scope) {
9118 case DIFV_SCOPE_THREAD:
9119 while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
9120 dtrace_difv_t *tlocals;
9121
9122 if ((ntlocals = (otlocals << 1)) == 0)
9123 ntlocals = 1;
9124
9125 osz = otlocals * sizeof (dtrace_difv_t);
9126 nsz = ntlocals * sizeof (dtrace_difv_t);
9127
9128 tlocals = kmem_zalloc(nsz, KM_SLEEP);
9129
9130 if (osz != 0) {
9131 bcopy(vstate->dtvs_tlocals,
9132 tlocals, osz);
9133 kmem_free(vstate->dtvs_tlocals, osz);
9134 }
9135
9136 vstate->dtvs_tlocals = tlocals;
9137 vstate->dtvs_ntlocals = ntlocals;
9138 }
9139
9140 vstate->dtvs_tlocals[id] = *v;
9141 continue;
9142
9143 case DIFV_SCOPE_LOCAL:
9144 np = &vstate->dtvs_nlocals;
9145 svarp = &vstate->dtvs_locals;
9146
9147 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9148 dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
9149 sizeof (uint64_t));
9150 else
9151 dsize = (int)NCPU * sizeof (uint64_t);
9152
9153 break;
9154
9155 case DIFV_SCOPE_GLOBAL:
9156 np = &vstate->dtvs_nglobals;
9157 svarp = &vstate->dtvs_globals;
9158
9159 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9160 dsize = v->dtdv_type.dtdt_size +
9161 sizeof (uint64_t);
9162
9163 break;
9164
9165 default:
9166 ASSERT(0);
9167 }
9168
9169 while (id >= (uint_t)(oldsvars = *np)) {
9170 dtrace_statvar_t **statics;
9171 int newsvars, oldsize, newsize;
9172
9173 if ((newsvars = (oldsvars << 1)) == 0)
9174 newsvars = 1;
9175
9176 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9177 newsize = newsvars * sizeof (dtrace_statvar_t *);
9178
9179 statics = kmem_zalloc(newsize, KM_SLEEP);
9180
9181 if (oldsize != 0) {
9182 bcopy(*svarp, statics, oldsize);
9183 kmem_free(*svarp, oldsize);
9184 }
9185
9186 *svarp = statics;
9187 *np = newsvars;
9188 }
9189
9190 if ((svar = (*svarp)[id]) == NULL) {
9191 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9192 svar->dtsv_var = *v;
9193
9194 if ((svar->dtsv_size = dsize) != 0) {
9195 svar->dtsv_data = (uint64_t)(uintptr_t)
9196 kmem_zalloc(dsize, KM_SLEEP);
9197 }
9198
9199 (*svarp)[id] = svar;
9200 }
9201
9202 svar->dtsv_refcnt++;
9203 }
9204
9205 dtrace_difo_chunksize(dp, vstate);
9206 dtrace_difo_hold(dp);
9207 }
9208
9209 static dtrace_difo_t *
9210 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9211 {
9212 dtrace_difo_t *new;
9213 size_t sz;
9214
9215 ASSERT(dp->dtdo_buf != NULL);
9216 ASSERT(dp->dtdo_refcnt != 0);
9217
9218 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
9219
9220 ASSERT(dp->dtdo_buf != NULL);
9221 sz = dp->dtdo_len * sizeof (dif_instr_t);
9222 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
9223 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
9224 new->dtdo_len = dp->dtdo_len;
9225
9226 if (dp->dtdo_strtab != NULL) {
9227 ASSERT(dp->dtdo_strlen != 0);
9228 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
9229 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
9230 new->dtdo_strlen = dp->dtdo_strlen;
9231 }
9232
9233 if (dp->dtdo_inttab != NULL) {
9234 ASSERT(dp->dtdo_intlen != 0);
9235 sz = dp->dtdo_intlen * sizeof (uint64_t);
9236 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
9237 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
9238 new->dtdo_intlen = dp->dtdo_intlen;
9239 }
9240
9241 if (dp->dtdo_vartab != NULL) {
9242 ASSERT(dp->dtdo_varlen != 0);
9243 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
9244 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
9245 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
9246 new->dtdo_varlen = dp->dtdo_varlen;
9247 }
9248
9249 dtrace_difo_init(new, vstate);
9250 return (new);
9251 }
9252
9253 static void
9254 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9255 {
9256 uint_t i;
9257
9258 ASSERT(dp->dtdo_refcnt == 0);
9259
9260 for (i = 0; i < dp->dtdo_varlen; i++) {
9261 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9262 dtrace_statvar_t *svar;
9263 dtrace_statvar_t **svarp = NULL;
9264 uint_t id;
9265 uint8_t scope = v->dtdv_scope;
9266 int *np = NULL;
9267
9268 switch (scope) {
9269 case DIFV_SCOPE_THREAD:
9270 continue;
9271
9272 case DIFV_SCOPE_LOCAL:
9273 np = &vstate->dtvs_nlocals;
9274 svarp = vstate->dtvs_locals;
9275 break;
9276
9277 case DIFV_SCOPE_GLOBAL:
9278 np = &vstate->dtvs_nglobals;
9279 svarp = vstate->dtvs_globals;
9280 break;
9281
9282 default:
9283 ASSERT(0);
9284 }
9285
9286 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9287 continue;
9288
9289 id -= DIF_VAR_OTHER_UBASE;
9290
9291 ASSERT(id < (uint_t)*np);
9292
9293 svar = svarp[id];
9294 ASSERT(svar != NULL);
9295 ASSERT(svar->dtsv_refcnt > 0);
9296
9297 if (--svar->dtsv_refcnt > 0)
9298 continue;
9299
9300 if (svar->dtsv_size != 0) {
9301 ASSERT(svar->dtsv_data != 0);
9302 kmem_free((void *)(uintptr_t)svar->dtsv_data,
9303 svar->dtsv_size);
9304 }
9305
9306 kmem_free(svar, sizeof (dtrace_statvar_t));
9307 svarp[id] = NULL;
9308 }
9309
9310 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9311 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9312 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9313 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9314
9315 kmem_free(dp, sizeof (dtrace_difo_t));
9316 }
9317
9318 static void
9319 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9320 {
9321 uint_t i;
9322
9323 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9324 ASSERT(dp->dtdo_refcnt != 0);
9325
9326 for (i = 0; i < dp->dtdo_varlen; i++) {
9327 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9328
9329 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9330 continue;
9331
9332 ASSERT(dtrace_vtime_references > 0);
9333 if (--dtrace_vtime_references == 0)
9334 dtrace_vtime_disable();
9335 }
9336
9337 if (--dp->dtdo_refcnt == 0)
9338 dtrace_difo_destroy(dp, vstate);
9339 }
9340
9341 /*
9342 * DTrace Format Functions
9343 */
9344 static uint16_t
9345 dtrace_format_add(dtrace_state_t *state, char *str)
9346 {
9347 char *fmt, **new;
9348 uint16_t ndx, len = strlen(str) + 1;
9349
9350 fmt = kmem_zalloc(len, KM_SLEEP);
9351 bcopy(str, fmt, len);
9352
9353 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
9354 if (state->dts_formats[ndx] == NULL) {
9355 state->dts_formats[ndx] = fmt;
9356 return (ndx + 1);
9357 }
9358 }
9359
9360 if (state->dts_nformats == USHRT_MAX) {
9361 /*
9362 * This is only likely if a denial-of-service attack is being
9363 * attempted. As such, it's okay to fail silently here.
9364 */
9365 kmem_free(fmt, len);
9366 return (0);
9367 }
9368
9369 /*
9370 * For simplicity, we always resize the formats array to be exactly the
9371 * number of formats.
9372 */
9373 ndx = state->dts_nformats++;
9374 new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
9375
9376 if (state->dts_formats != NULL) {
9377 ASSERT(ndx != 0);
9378 bcopy(state->dts_formats, new, ndx * sizeof (char *));
9379 kmem_free(state->dts_formats, ndx * sizeof (char *));
9380 }
9381
9382 state->dts_formats = new;
9383 state->dts_formats[ndx] = fmt;
9384
9385 return (ndx + 1);
9386 }
9387
9388 static void
9389 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
9390 {
9391 char *fmt;
9392
9393 ASSERT(state->dts_formats != NULL);
9394 ASSERT(format <= state->dts_nformats);
9395 ASSERT(state->dts_formats[format - 1] != NULL);
9396
9397 fmt = state->dts_formats[format - 1];
9398 kmem_free(fmt, strlen(fmt) + 1);
9399 state->dts_formats[format - 1] = NULL;
9400 }
9401
9402 static void
9403 dtrace_format_destroy(dtrace_state_t *state)
9404 {
9405 int i;
9406
9407 if (state->dts_nformats == 0) {
9408 ASSERT(state->dts_formats == NULL);
9409 return;
9410 }
9411
9412 ASSERT(state->dts_formats != NULL);
9413
9414 for (i = 0; i < state->dts_nformats; i++) {
9415 char *fmt = state->dts_formats[i];
9416
9417 if (fmt == NULL)
9418 continue;
9419
9420 kmem_free(fmt, strlen(fmt) + 1);
9421 }
9422
9423 kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
9424 state->dts_nformats = 0;
9425 state->dts_formats = NULL;
9426 }
9427
9428 /*
9429 * DTrace Predicate Functions
9430 */
9431 static dtrace_predicate_t *
9432 dtrace_predicate_create(dtrace_difo_t *dp)
9433 {
9434 dtrace_predicate_t *pred;
9435
9436 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9437 ASSERT(dp->dtdo_refcnt != 0);
9438
9439 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
9440 pred->dtp_difo = dp;
9441 pred->dtp_refcnt = 1;
9442
9443 if (!dtrace_difo_cacheable(dp))
9444 return (pred);
9445
9446 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
9447 /*
9448 * This is only theoretically possible -- we have had 2^32
9449 * cacheable predicates on this machine. We cannot allow any
9450 * more predicates to become cacheable: as unlikely as it is,
9451 * there may be a thread caching a (now stale) predicate cache
9452 * ID. (N.B.: the temptation is being successfully resisted to
9453 * have this cmn_err() "Holy shit -- we executed this code!")
9454 */
9455 return (pred);
9456 }
9457
9458 pred->dtp_cacheid = dtrace_predcache_id++;
9459
9460 return (pred);
9461 }
9462
9463 static void
9464 dtrace_predicate_hold(dtrace_predicate_t *pred)
9465 {
9466 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9467 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
9468 ASSERT(pred->dtp_refcnt > 0);
9469
9470 pred->dtp_refcnt++;
9471 }
9472
9473 static void
9474 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
9475 {
9476 dtrace_difo_t *dp = pred->dtp_difo;
9477 #pragma unused(dp) /* __APPLE__ */
9478
9479 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9480 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
9481 ASSERT(pred->dtp_refcnt > 0);
9482
9483 if (--pred->dtp_refcnt == 0) {
9484 dtrace_difo_release(pred->dtp_difo, vstate);
9485 kmem_free(pred, sizeof (dtrace_predicate_t));
9486 }
9487 }
9488
9489 /*
9490 * DTrace Action Description Functions
9491 */
9492 static dtrace_actdesc_t *
9493 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
9494 uint64_t uarg, uint64_t arg)
9495 {
9496 dtrace_actdesc_t *act;
9497
9498 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
9499 arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
9500
9501 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
9502 act->dtad_kind = kind;
9503 act->dtad_ntuple = ntuple;
9504 act->dtad_uarg = uarg;
9505 act->dtad_arg = arg;
9506 act->dtad_refcnt = 1;
9507
9508 return (act);
9509 }
9510
9511 static void
9512 dtrace_actdesc_hold(dtrace_actdesc_t *act)
9513 {
9514 ASSERT(act->dtad_refcnt >= 1);
9515 act->dtad_refcnt++;
9516 }
9517
9518 static void
9519 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
9520 {
9521 dtrace_actkind_t kind = act->dtad_kind;
9522 dtrace_difo_t *dp;
9523
9524 ASSERT(act->dtad_refcnt >= 1);
9525
9526 if (--act->dtad_refcnt != 0)
9527 return;
9528
9529 if ((dp = act->dtad_difo) != NULL)
9530 dtrace_difo_release(dp, vstate);
9531
9532 if (DTRACEACT_ISPRINTFLIKE(kind)) {
9533 char *str = (char *)(uintptr_t)act->dtad_arg;
9534
9535 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
9536 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
9537
9538 if (str != NULL)
9539 kmem_free(str, strlen(str) + 1);
9540 }
9541
9542 kmem_free(act, sizeof (dtrace_actdesc_t));
9543 }
9544
9545 /*
9546 * DTrace ECB Functions
9547 */
9548 static dtrace_ecb_t *
9549 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
9550 {
9551 dtrace_ecb_t *ecb;
9552 dtrace_epid_t epid;
9553
9554 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9555
9556 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
9557 ecb->dte_predicate = NULL;
9558 ecb->dte_probe = probe;
9559
9560 /*
9561 * The default size is the size of the default action: recording
9562 * the epid.
9563 */
9564 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9565 ecb->dte_alignment = sizeof (dtrace_epid_t);
9566
9567 epid = state->dts_epid++;
9568
9569 if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
9570 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
9571 int necbs = state->dts_necbs << 1;
9572
9573 ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);
9574
9575 if (necbs == 0) {
9576 ASSERT(oecbs == NULL);
9577 necbs = 1;
9578 }
9579
9580 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
9581
9582 if (oecbs != NULL)
9583 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
9584
9585 dtrace_membar_producer();
9586 state->dts_ecbs = ecbs;
9587
9588 if (oecbs != NULL) {
9589 /*
9590 * If this state is active, we must dtrace_sync()
9591 * before we can free the old dts_ecbs array: we're
9592 * coming in hot, and there may be active ring
9593 * buffer processing (which indexes into the dts_ecbs
9594 * array) on another CPU.
9595 */
9596 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
9597 dtrace_sync();
9598
9599 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
9600 }
9601
9602 dtrace_membar_producer();
9603 state->dts_necbs = necbs;
9604 }
9605
9606 ecb->dte_state = state;
9607
9608 ASSERT(state->dts_ecbs[epid - 1] == NULL);
9609 dtrace_membar_producer();
9610 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
9611
9612 return (ecb);
9613 }
9614
9615 static int
9616 dtrace_ecb_enable(dtrace_ecb_t *ecb)
9617 {
9618 dtrace_probe_t *probe = ecb->dte_probe;
9619
9620 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
9621 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9622 ASSERT(ecb->dte_next == NULL);
9623
9624 if (probe == NULL) {
9625 /*
9626 * This is the NULL probe -- there's nothing to do.
9627 */
9628 return (0);
9629 }
9630
9631 probe->dtpr_provider->dtpv_ecb_count++;
9632 if (probe->dtpr_ecb == NULL) {
9633 dtrace_provider_t *prov = probe->dtpr_provider;
9634
9635 /*
9636 * We're the first ECB on this probe.
9637 */
9638 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
9639
9640 if (ecb->dte_predicate != NULL)
9641 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
9642
9643 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
9644 probe->dtpr_id, probe->dtpr_arg));
9645 } else {
9646 /*
9647 * This probe is already active. Swing the last pointer to
9648 * point to the new ECB, and issue a dtrace_sync() to assure
9649 * that all CPUs have seen the change.
9650 */
9651 ASSERT(probe->dtpr_ecb_last != NULL);
9652 probe->dtpr_ecb_last->dte_next = ecb;
9653 probe->dtpr_ecb_last = ecb;
9654 probe->dtpr_predcache = 0;
9655
9656 dtrace_sync();
9657 return (0);
9658 }
9659 }
9660
9661 static void
9662 dtrace_ecb_resize(dtrace_ecb_t *ecb)
9663 {
9664 uint32_t maxalign = sizeof (dtrace_epid_t);
9665 uint32_t align = sizeof (uint8_t), offs, diff;
9666 dtrace_action_t *act;
9667 int wastuple = 0;
9668 uint32_t aggbase = UINT32_MAX;
9669 dtrace_state_t *state = ecb->dte_state;
9670
9671 /*
9672 * If we record anything, we always record the epid. (And we always
9673 * record it first.)
9674 */
9675 offs = sizeof (dtrace_epid_t);
9676 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9677
9678 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9679 dtrace_recdesc_t *rec = &act->dta_rec;
9680
9681 if ((align = rec->dtrd_alignment) > maxalign)
9682 maxalign = align;
9683
9684 if (!wastuple && act->dta_intuple) {
9685 /*
9686 * This is the first record in a tuple. Align the
9687 * offset to be at offset 4 in an 8-byte aligned
9688 * block.
9689 */
9690 diff = offs + sizeof (dtrace_aggid_t);
9691
9692 if ((diff = (diff & (sizeof (uint64_t) - 1))))
9693 offs += sizeof (uint64_t) - diff;
9694
9695 aggbase = offs - sizeof (dtrace_aggid_t);
9696 ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
9697 }
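/*
 * Editorial worked example (hedged, assuming a 4-byte dtrace_aggid_t):
 * if offs is 8 when the first tuple record is seen, diff starts as 12 and
 * 12 & 7 == 4, so offs is advanced to 12 and aggbase becomes 8 -- the
 * aggregation id lands on an 8-byte boundary and the first tuple record
 * sits at offset 4 within that 8-byte aligned block, as described above.
 */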
9698
9699 /*LINTED*/
9700 if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
9701 /*
9702 * The current offset is not properly aligned; align it.
9703 */
9704 offs += align - diff;
9705 }
9706
9707 rec->dtrd_offset = offs;
9708
9709 if (offs + rec->dtrd_size > ecb->dte_needed) {
9710 ecb->dte_needed = offs + rec->dtrd_size;
9711
9712 if (ecb->dte_needed > state->dts_needed)
9713 state->dts_needed = ecb->dte_needed;
9714 }
9715
9716 if (DTRACEACT_ISAGG(act->dta_kind)) {
9717 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9718 dtrace_action_t *first = agg->dtag_first, *prev;
9719
9720 ASSERT(rec->dtrd_size != 0 && first != NULL);
9721 ASSERT(wastuple);
9722 ASSERT(aggbase != UINT32_MAX);
9723
9724 agg->dtag_base = aggbase;
9725
9726 while ((prev = first->dta_prev) != NULL &&
9727 DTRACEACT_ISAGG(prev->dta_kind)) {
9728 agg = (dtrace_aggregation_t *)prev;
9729 first = agg->dtag_first;
9730 }
9731
9732 if (prev != NULL) {
9733 offs = prev->dta_rec.dtrd_offset +
9734 prev->dta_rec.dtrd_size;
9735 } else {
9736 offs = sizeof (dtrace_epid_t);
9737 }
9738 wastuple = 0;
9739 } else {
9740 if (!act->dta_intuple)
9741 ecb->dte_size = offs + rec->dtrd_size;
9742
9743 offs += rec->dtrd_size;
9744 }
9745
9746 wastuple = act->dta_intuple;
9747 }
9748
9749 if ((act = ecb->dte_action) != NULL &&
9750 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
9751 ecb->dte_size == sizeof (dtrace_epid_t)) {
9752 /*
9753 * If the size is still sizeof (dtrace_epid_t), then all
9754 * actions store no data; set the size to 0.
9755 */
9756 ecb->dte_alignment = maxalign;
9757 ecb->dte_size = 0;
9758
9759 /*
9760 * If the needed space is still sizeof (dtrace_epid_t), then
9761 * all actions need no additional space; set the needed
9762 * size to 0.
9763 */
9764 if (ecb->dte_needed == sizeof (dtrace_epid_t))
9765 ecb->dte_needed = 0;
9766
9767 return;
9768 }
9769
9770 /*
9771 * Set our alignment, and make sure that the dte_size and dte_needed
9772 * are aligned to the size of an EPID.
9773 */
9774 ecb->dte_alignment = maxalign;
9775 ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
9776 ~(sizeof (dtrace_epid_t) - 1);
9777 ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
9778 ~(sizeof (dtrace_epid_t) - 1);
9779 ASSERT(ecb->dte_size <= ecb->dte_needed);
9780 }
9781
9782 static dtrace_action_t *
9783 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9784 {
9785 dtrace_aggregation_t *agg;
9786 size_t size = sizeof (uint64_t);
9787 int ntuple = desc->dtad_ntuple;
9788 dtrace_action_t *act;
9789 dtrace_recdesc_t *frec;
9790 dtrace_aggid_t aggid;
9791 dtrace_state_t *state = ecb->dte_state;
9792
9793 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
9794 agg->dtag_ecb = ecb;
9795
9796 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
9797
9798 switch (desc->dtad_kind) {
9799 case DTRACEAGG_MIN:
9800 agg->dtag_initial = INT64_MAX;
9801 agg->dtag_aggregate = dtrace_aggregate_min;
9802 break;
9803
9804 case DTRACEAGG_MAX:
9805 agg->dtag_initial = INT64_MIN;
9806 agg->dtag_aggregate = dtrace_aggregate_max;
9807 break;
9808
9809 case DTRACEAGG_COUNT:
9810 agg->dtag_aggregate = dtrace_aggregate_count;
9811 break;
9812
9813 case DTRACEAGG_QUANTIZE:
9814 agg->dtag_aggregate = dtrace_aggregate_quantize;
9815 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
9816 sizeof (uint64_t);
9817 break;
9818
9819 case DTRACEAGG_LQUANTIZE: {
9820 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
9821 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
9822
9823 agg->dtag_initial = desc->dtad_arg;
9824 agg->dtag_aggregate = dtrace_aggregate_lquantize;
9825
9826 if (step == 0 || levels == 0)
9827 goto err;
9828
9829 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
9830 break;
9831 }
9832
9833 case DTRACEAGG_LLQUANTIZE: {
9834 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
9835 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
9836 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
9837 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
9838 int64_t v;
9839
9840 agg->dtag_initial = desc->dtad_arg;
9841 agg->dtag_aggregate = dtrace_aggregate_llquantize;
9842
9843 if (factor < 2 || low >= high || nsteps < factor)
9844 goto err;
9845
9846 /*
9847 * Now check that the number of steps evenly divides a power
9848 * of the factor. (This assures both integer bucket size and
9849 * linearity within each magnitude.)
9850 */
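/*
 * Editorial worked example (hedged): with factor = 10 and nsteps = 20,
 * the loop below leaves v = 100; since 100 % 20 == 0 and 20 % 10 == 0,
 * the parameters are accepted. With nsteps = 30 instead, 100 % 30 != 0
 * and the aggregation is rejected.
 */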
9851 for (v = factor; v < nsteps; v *= factor)
9852 continue;
9853
9854 if ((v % nsteps) || (nsteps % factor))
9855 goto err;
9856
9857 size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
9858 break;
9859 }
9860
9861 case DTRACEAGG_AVG:
9862 agg->dtag_aggregate = dtrace_aggregate_avg;
9863 size = sizeof (uint64_t) * 2;
9864 break;
9865
9866 case DTRACEAGG_STDDEV:
9867 agg->dtag_aggregate = dtrace_aggregate_stddev;
9868 size = sizeof (uint64_t) * 4;
9869 break;
9870
9871 case DTRACEAGG_SUM:
9872 agg->dtag_aggregate = dtrace_aggregate_sum;
9873 break;
9874
9875 default:
9876 goto err;
9877 }
9878
9879 agg->dtag_action.dta_rec.dtrd_size = size;
9880
9881 if (ntuple == 0)
9882 goto err;
9883
9884 /*
9885 * We must make sure that we have enough actions for the n-tuple.
9886 */
9887 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
9888 if (DTRACEACT_ISAGG(act->dta_kind))
9889 break;
9890
9891 if (--ntuple == 0) {
9892 /*
9893 * This is the action with which our n-tuple begins.
9894 */
9895 agg->dtag_first = act;
9896 goto success;
9897 }
9898 }
9899
9900 /*
9901 * This n-tuple is short by ntuple elements. Return failure.
9902 */
9903 ASSERT(ntuple != 0);
9904 err:
9905 kmem_free(agg, sizeof (dtrace_aggregation_t));
9906 return (NULL);
9907
9908 success:
9909 /*
9910 * If the last action in the tuple has a size of zero, it's actually
9911 * an expression argument for the aggregating action.
9912 */
9913 ASSERT(ecb->dte_action_last != NULL);
9914 act = ecb->dte_action_last;
9915
9916 if (act->dta_kind == DTRACEACT_DIFEXPR) {
9917 ASSERT(act->dta_difo != NULL);
9918
9919 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
9920 agg->dtag_hasarg = 1;
9921 }
9922
9923 /*
9924 * We need to allocate an id for this aggregation.
9925 */
9926 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
9927 VM_BESTFIT | VM_SLEEP);
9928
9929 if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
9930 dtrace_aggregation_t **oaggs = state->dts_aggregations;
9931 dtrace_aggregation_t **aggs;
9932 int naggs = state->dts_naggregations << 1;
9933 int onaggs = state->dts_naggregations;
9934
9935 ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);
9936
9937 if (naggs == 0) {
9938 ASSERT(oaggs == NULL);
9939 naggs = 1;
9940 }
9941
9942 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
9943
9944 if (oaggs != NULL) {
9945 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
9946 kmem_free(oaggs, onaggs * sizeof (*aggs));
9947 }
9948
9949 state->dts_aggregations = aggs;
9950 state->dts_naggregations = naggs;
9951 }
9952
9953 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
9954 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
9955
9956 frec = &agg->dtag_first->dta_rec;
9957 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
9958 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
9959
9960 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
9961 ASSERT(!act->dta_intuple);
9962 act->dta_intuple = 1;
9963 }
9964
9965 return (&agg->dtag_action);
9966 }
9967
9968 static void
9969 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
9970 {
9971 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9972 dtrace_state_t *state = ecb->dte_state;
9973 dtrace_aggid_t aggid = agg->dtag_id;
9974
9975 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
9976 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
9977
9978 ASSERT(state->dts_aggregations[aggid - 1] == agg);
9979 state->dts_aggregations[aggid - 1] = NULL;
9980
9981 kmem_free(agg, sizeof (dtrace_aggregation_t));
9982 }
9983
9984 static int
9985 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9986 {
9987 dtrace_action_t *action, *last;
9988 dtrace_difo_t *dp = desc->dtad_difo;
9989 uint32_t size = 0, align = sizeof (uint8_t), mask;
9990 uint16_t format = 0;
9991 dtrace_recdesc_t *rec;
9992 dtrace_state_t *state = ecb->dte_state;
9993 dtrace_optval_t *opt = state->dts_options;
9994 dtrace_optval_t nframes=0, strsize;
9995 uint64_t arg = desc->dtad_arg;
9996
9997 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9998 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
9999
10000 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
10001 /*
10002 * If this is an aggregating action, there must be neither
10003 * a speculate nor a commit on the action chain.
10004 */
10005 dtrace_action_t *act;
10006
10007 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10008 if (act->dta_kind == DTRACEACT_COMMIT)
10009 return (EINVAL);
10010
10011 if (act->dta_kind == DTRACEACT_SPECULATE)
10012 return (EINVAL);
10013 }
10014
10015 action = dtrace_ecb_aggregation_create(ecb, desc);
10016
10017 if (action == NULL)
10018 return (EINVAL);
10019 } else {
10020 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
10021 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
10022 dp != NULL && dp->dtdo_destructive)) {
10023 state->dts_destructive = 1;
10024 }
10025
10026 switch (desc->dtad_kind) {
10027 case DTRACEACT_PRINTF:
10028 case DTRACEACT_PRINTA:
10029 case DTRACEACT_SYSTEM:
10030 case DTRACEACT_FREOPEN:
10031 /*
10032 * We know that our arg is a string -- turn it into a
10033 * format.
10034 */
10035 if (arg == 0) {
10036 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
10037 format = 0;
10038 } else {
10039 ASSERT(arg != 0);
10040 ASSERT(arg > KERNELBASE);
10041 format = dtrace_format_add(state,
10042 (char *)(uintptr_t)arg);
10043 }
10044
10045 /*FALLTHROUGH*/
10046 case DTRACEACT_LIBACT:
10047 case DTRACEACT_DIFEXPR:
10048 case DTRACEACT_TRACEMEM:
10049 case DTRACEACT_TRACEMEM_DYNSIZE:
10050 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
10051 if (dp == NULL)
10052 return (EINVAL);
10053
10054 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
10055 break;
10056
10057 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
10058 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10059 return (EINVAL);
10060
10061 size = opt[DTRACEOPT_STRSIZE];
10062 }
10063
10064 break;
10065
10066 case DTRACEACT_STACK:
10067 if ((nframes = arg) == 0) {
10068 nframes = opt[DTRACEOPT_STACKFRAMES];
10069 ASSERT(nframes > 0);
10070 arg = nframes;
10071 }
10072
10073 size = nframes * sizeof (pc_t);
10074 break;
10075
10076 case DTRACEACT_JSTACK:
10077 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
10078 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
10079
10080 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
10081 nframes = opt[DTRACEOPT_JSTACKFRAMES];
10082
10083 arg = DTRACE_USTACK_ARG(nframes, strsize);
10084
10085 /*FALLTHROUGH*/
10086 case DTRACEACT_USTACK:
10087 if (desc->dtad_kind != DTRACEACT_JSTACK &&
10088 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
10089 strsize = DTRACE_USTACK_STRSIZE(arg);
10090 nframes = opt[DTRACEOPT_USTACKFRAMES];
10091 ASSERT(nframes > 0);
10092 arg = DTRACE_USTACK_ARG(nframes, strsize);
10093 }
10094
10095 /*
10096 * Save a slot for the pid.
10097 */
10098 size = (nframes + 1) * sizeof (uint64_t);
10099 size += DTRACE_USTACK_STRSIZE(arg);
10100 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
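/*
 * Given the pid slot noted above, the record presumably lays out as
 * one 64-bit slot for the pid, followed by nframes 64-bit program
 * counters and then strsize bytes of string space, with the total
 * rounded up to pointer alignment. For example, nframes of 20 with
 * no string space yields (20 + 1) * 8 == 168 bytes.
 */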
10101
10102 break;
10103
10104 case DTRACEACT_SYM:
10105 case DTRACEACT_MOD:
10106 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
10107 sizeof (uint64_t)) ||
10108 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10109 return (EINVAL);
10110 break;
10111
10112 case DTRACEACT_USYM:
10113 case DTRACEACT_UMOD:
10114 case DTRACEACT_UADDR:
10115 if (dp == NULL ||
10116 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
10117 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10118 return (EINVAL);
10119
10120 /*
10121 * We have a slot for the pid, plus a slot for the
10122 * argument. To keep things simple (aligned with
10123 * bitness-neutral sizing), we store each as a 64-bit
10124 * quantity.
10125 */
10126 size = 2 * sizeof (uint64_t);
10127 break;
10128
10129 case DTRACEACT_STOP:
10130 case DTRACEACT_BREAKPOINT:
10131 case DTRACEACT_PANIC:
10132 break;
10133
10134 case DTRACEACT_CHILL:
10135 case DTRACEACT_DISCARD:
10136 case DTRACEACT_RAISE:
10137 case DTRACEACT_PIDRESUME: /* __APPLE__ */
10138 if (dp == NULL)
10139 return (EINVAL);
10140 break;
10141
10142 case DTRACEACT_EXIT:
10143 if (dp == NULL ||
10144 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
10145 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10146 return (EINVAL);
10147 break;
10148
10149 case DTRACEACT_SPECULATE:
10150 if (ecb->dte_size > sizeof (dtrace_epid_t))
10151 return (EINVAL);
10152
10153 if (dp == NULL)
10154 return (EINVAL);
10155
10156 state->dts_speculates = 1;
10157 break;
10158
10159 case DTRACEACT_COMMIT: {
10160 dtrace_action_t *act = ecb->dte_action;
10161
10162 for (; act != NULL; act = act->dta_next) {
10163 if (act->dta_kind == DTRACEACT_COMMIT)
10164 return (EINVAL);
10165 }
10166
10167 if (dp == NULL)
10168 return (EINVAL);
10169 break;
10170 }
10171
10172 default:
10173 return (EINVAL);
10174 }
10175
10176 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
10177 /*
10178 * If this is a data-storing action or a speculate,
10179 * we must be sure that there isn't a commit on the
10180 * action chain.
10181 */
10182 dtrace_action_t *act = ecb->dte_action;
10183
10184 for (; act != NULL; act = act->dta_next) {
10185 if (act->dta_kind == DTRACEACT_COMMIT)
10186 return (EINVAL);
10187 }
10188 }
10189
10190 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
10191 action->dta_rec.dtrd_size = size;
10192 }
10193
10194 action->dta_refcnt = 1;
10195 rec = &action->dta_rec;
10196 size = rec->dtrd_size;
10197
10198 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
10199 if (!(size & mask)) {
10200 align = mask + 1;
10201 break;
10202 }
10203 }
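/*
 * The loop above derives the record's alignment from its size: it picks
 * the largest power of two (at most sizeof (uint64_t)) that evenly
 * divides the size. For example, a 16-byte record aligns to 8, a
 * 12-byte record to 4, and an odd-sized record falls back to the byte
 * alignment that align was initialized with.
 */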
10204
10205 action->dta_kind = desc->dtad_kind;
10206
10207 if ((action->dta_difo = dp) != NULL)
10208 dtrace_difo_hold(dp);
10209
10210 rec->dtrd_action = action->dta_kind;
10211 rec->dtrd_arg = arg;
10212 rec->dtrd_uarg = desc->dtad_uarg;
10213 rec->dtrd_alignment = (uint16_t)align;
10214 rec->dtrd_format = format;
10215
10216 if ((last = ecb->dte_action_last) != NULL) {
10217 ASSERT(ecb->dte_action != NULL);
10218 action->dta_prev = last;
10219 last->dta_next = action;
10220 } else {
10221 ASSERT(ecb->dte_action == NULL);
10222 ecb->dte_action = action;
10223 }
10224
10225 ecb->dte_action_last = action;
10226
10227 return (0);
10228 }
10229
10230 static void
10231 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
10232 {
10233 dtrace_action_t *act = ecb->dte_action, *next;
10234 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
10235 dtrace_difo_t *dp;
10236 uint16_t format;
10237
10238 if (act != NULL && act->dta_refcnt > 1) {
10239 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
10240 act->dta_refcnt--;
10241 } else {
10242 for (; act != NULL; act = next) {
10243 next = act->dta_next;
10244 ASSERT(next != NULL || act == ecb->dte_action_last);
10245 ASSERT(act->dta_refcnt == 1);
10246
10247 if ((format = act->dta_rec.dtrd_format) != 0)
10248 dtrace_format_remove(ecb->dte_state, format);
10249
10250 if ((dp = act->dta_difo) != NULL)
10251 dtrace_difo_release(dp, vstate);
10252
10253 if (DTRACEACT_ISAGG(act->dta_kind)) {
10254 dtrace_ecb_aggregation_destroy(ecb, act);
10255 } else {
10256 kmem_free(act, sizeof (dtrace_action_t));
10257 }
10258 }
10259 }
10260
10261 ecb->dte_action = NULL;
10262 ecb->dte_action_last = NULL;
10263 ecb->dte_size = sizeof (dtrace_epid_t);
10264 }
10265
10266 static void
10267 dtrace_ecb_disable(dtrace_ecb_t *ecb)
10268 {
10269 /*
10270 * We disable the ECB by removing it from its probe.
10271 */
10272 dtrace_ecb_t *pecb, *prev = NULL;
10273 dtrace_probe_t *probe = ecb->dte_probe;
10274
10275 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10276
10277 if (probe == NULL) {
10278 /*
10279 * This is the NULL probe; there is nothing to disable.
10280 */
10281 return;
10282 }
10283
10284 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
10285 if (pecb == ecb)
10286 break;
10287 prev = pecb;
10288 }
10289
10290 ASSERT(pecb != NULL);
10291
10292 if (prev == NULL) {
10293 probe->dtpr_ecb = ecb->dte_next;
10294 } else {
10295 prev->dte_next = ecb->dte_next;
10296 }
10297
10298 if (ecb == probe->dtpr_ecb_last) {
10299 ASSERT(ecb->dte_next == NULL);
10300 probe->dtpr_ecb_last = prev;
10301 }
10302
10303 probe->dtpr_provider->dtpv_ecb_count--;
10304 /*
10305 * The ECB has been disconnected from the probe; now sync to assure
10306 * that all CPUs have seen the change before returning.
10307 */
10308 dtrace_sync();
10309
10310 if (probe->dtpr_ecb == NULL) {
10311 /*
10312 * That was the last ECB on the probe; clear the predicate
10313 * cache ID for the probe, disable it and sync one more time
10314 * to assure that we'll never hit it again.
10315 */
10316 dtrace_provider_t *prov = probe->dtpr_provider;
10317
10318 ASSERT(ecb->dte_next == NULL);
10319 ASSERT(probe->dtpr_ecb_last == NULL);
10320 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
10321 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
10322 probe->dtpr_id, probe->dtpr_arg);
10323 dtrace_sync();
10324 } else {
10325 /*
10326 * There is at least one ECB remaining on the probe. If there
10327 * is _exactly_ one, set the probe's predicate cache ID to be
10328 * the predicate cache ID of the remaining ECB.
10329 */
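/*
 * (The point of this cache, presumably, is the fast path in
 * dtrace_probe(): when a probe carries a single ECB whose predicate has
 * already evaluated false for the firing thread, the cached ID lets the
 * probe be dismissed without re-evaluating the predicate.)
 */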
10330 ASSERT(probe->dtpr_ecb_last != NULL);
10331 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
10332
10333 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
10334 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
10335
10336 ASSERT(probe->dtpr_ecb->dte_next == NULL);
10337
10338 if (p != NULL)
10339 probe->dtpr_predcache = p->dtp_cacheid;
10340 }
10341
10342 ecb->dte_next = NULL;
10343 }
10344 }
10345
10346 static void
10347 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10348 {
10349 dtrace_state_t *state = ecb->dte_state;
10350 dtrace_vstate_t *vstate = &state->dts_vstate;
10351 dtrace_predicate_t *pred;
10352 dtrace_epid_t epid = ecb->dte_epid;
10353
10354 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10355 ASSERT(ecb->dte_next == NULL);
10356 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10357
10358 if ((pred = ecb->dte_predicate) != NULL)
10359 dtrace_predicate_release(pred, vstate);
10360
10361 dtrace_ecb_action_remove(ecb);
10362
10363 ASSERT(state->dts_ecbs[epid - 1] == ecb);
10364 state->dts_ecbs[epid - 1] = NULL;
10365
10366 kmem_free(ecb, sizeof (dtrace_ecb_t));
10367 }
10368
10369 static dtrace_ecb_t *
10370 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
10371 dtrace_enabling_t *enab)
10372 {
10373 dtrace_ecb_t *ecb;
10374 dtrace_predicate_t *pred;
10375 dtrace_actdesc_t *act;
10376 dtrace_provider_t *prov;
10377 dtrace_ecbdesc_t *desc = enab->dten_current;
10378
10379 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10380 ASSERT(state != NULL);
10381
10382 ecb = dtrace_ecb_add(state, probe);
10383 ecb->dte_uarg = desc->dted_uarg;
10384
10385 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
10386 dtrace_predicate_hold(pred);
10387 ecb->dte_predicate = pred;
10388 }
10389
10390 if (probe != NULL) {
10391 /*
10392 * If the provider shows more leg than the consumer is old
10393 * enough to see, we need to enable the appropriate implicit
10394 * predicate bits to prevent the ecb from activating at
10395 * revealing times.
10396 *
10397 * Providers specifying DTRACE_PRIV_USER at register time
10398 * are stating that they need the /proc-style privilege
10399 * model to be enforced, and this is what DTRACE_COND_OWNER
10400 * and DTRACE_COND_ZONEOWNER will then do at probe time.
10401 */
10402 prov = probe->dtpr_provider;
10403 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
10404 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10405 ecb->dte_cond |= DTRACE_COND_OWNER;
10406
10407 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
10408 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10409 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
10410
10411 /*
10412 * If the provider shows us kernel innards and the user
10413 * is lacking sufficient privilege, enable the
10414 * DTRACE_COND_USERMODE implicit predicate.
10415 */
10416 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
10417 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
10418 ecb->dte_cond |= DTRACE_COND_USERMODE;
10419 }
10420
10421 if (dtrace_ecb_create_cache != NULL) {
10422 /*
10423 * If we have a cached ecb, we'll use its action list instead
10424 * of creating our own (saving both time and space).
10425 */
10426 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
10427 dtrace_action_t *act_if = cached->dte_action;
10428
10429 if (act_if != NULL) {
10430 ASSERT(act_if->dta_refcnt > 0);
10431 act_if->dta_refcnt++;
10432 ecb->dte_action = act_if;
10433 ecb->dte_action_last = cached->dte_action_last;
10434 ecb->dte_needed = cached->dte_needed;
10435 ecb->dte_size = cached->dte_size;
10436 ecb->dte_alignment = cached->dte_alignment;
10437 }
10438
10439 return (ecb);
10440 }
10441
10442 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
10443 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
10444 dtrace_ecb_destroy(ecb);
10445 return (NULL);
10446 }
10447 }
10448
10449 dtrace_ecb_resize(ecb);
10450
10451 return (dtrace_ecb_create_cache = ecb);
10452 }
10453
10454 static int
10455 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
10456 {
10457 dtrace_ecb_t *ecb;
10458 dtrace_enabling_t *enab = arg;
10459 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
10460
10461 ASSERT(state != NULL);
10462
10463 if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
10464 /*
10465 * This probe was created in a generation for which this
10466 * enabling has previously created ECBs; we don't want to
10467 * enable it again, so just kick out.
10468 */
10469 return (DTRACE_MATCH_NEXT);
10470 }
10471
10472 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
10473 return (DTRACE_MATCH_DONE);
10474
10475 if (dtrace_ecb_enable(ecb) < 0)
10476 return (DTRACE_MATCH_FAIL);
10477
10478 return (DTRACE_MATCH_NEXT);
10479 }
10480
10481 static dtrace_ecb_t *
10482 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
10483 {
10484 dtrace_ecb_t *ecb;
10485 #pragma unused(ecb) /* __APPLE__ */
10486
10487 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10488
10489 if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
10490 return (NULL);
10491
10492 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
10493 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
10494
10495 return (state->dts_ecbs[id - 1]);
10496 }
10497
10498 static dtrace_aggregation_t *
10499 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
10500 {
10501 dtrace_aggregation_t *agg;
10502 #pragma unused(agg) /* __APPLE__ */
10503
10504 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10505
10506 if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
10507 return (NULL);
10508
10509 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
10510 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
10511 agg->dtag_id == id);
10512
10513 return (state->dts_aggregations[id - 1]);
10514 }
10515
10516 /*
10517 * DTrace Buffer Functions
10518 *
10519 * The following functions manipulate DTrace buffers. Most of these functions
10520 * are called in the context of establishing or processing consumer state;
10521 * exceptions are explicitly noted.
10522 */
10523
10524 /*
10525 * Note: called from cross call context. This function switches the two
10526 * buffers on a given CPU. The atomicity of this operation is assured by
10527 * disabling interrupts while the actual switch takes place; the disabling of
10528 * interrupts serializes the execution with any execution of dtrace_probe() on
10529 * the same CPU.
10530 */
10531 static void
10532 dtrace_buffer_switch(dtrace_buffer_t *buf)
10533 {
10534 caddr_t tomax = buf->dtb_tomax;
10535 caddr_t xamot = buf->dtb_xamot;
10536 dtrace_icookie_t cookie;
10537
10538 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10539 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
10540
10541 cookie = dtrace_interrupt_disable();
10542 buf->dtb_tomax = xamot;
10543 buf->dtb_xamot = tomax;
10544 buf->dtb_xamot_drops = buf->dtb_drops;
10545 buf->dtb_xamot_offset = buf->dtb_offset;
10546 buf->dtb_xamot_errors = buf->dtb_errors;
10547 buf->dtb_xamot_flags = buf->dtb_flags;
10548 buf->dtb_offset = 0;
10549 buf->dtb_drops = 0;
10550 buf->dtb_errors = 0;
10551 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
10552 dtrace_interrupt_enable(cookie);
10553 }
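/*
 * A minimal sketch of how this is driven (the details live in the
 * consumer snapshot path, e.g. the BUFSNAP ioctl): the caller cross
 * calls onto the target CPU so that the swap happens on that CPU with
 * interrupts disabled, along the lines of:
 *
 *	dtrace_xcall(desc.dtbd_cpu,
 *	    (dtrace_xcall_t)dtrace_buffer_switch, buf);
 */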
10554
10555 /*
10556 * Note: called from cross call context. This function activates a buffer
10557 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
10558 * is guaranteed by the disabling of interrupts.
10559 */
10560 static void
10561 dtrace_buffer_activate(dtrace_state_t *state)
10562 {
10563 dtrace_buffer_t *buf;
10564 dtrace_icookie_t cookie = dtrace_interrupt_disable();
10565
10566 buf = &state->dts_buffer[CPU->cpu_id];
10567
10568 if (buf->dtb_tomax != NULL) {
10569 /*
10570 * We might like to assert that the buffer is marked inactive,
10571 * but this isn't necessarily true: the buffer for the CPU
10572 * that processes the BEGIN probe has its buffer activated
10573 * manually. In this case, we take the (harmless) action of
10574 * re-clearing the INACTIVE bit.
10575 */
10576 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
10577 }
10578
10579 dtrace_interrupt_enable(cookie);
10580 }
10581
10582 static int
10583 dtrace_buffer_canalloc(size_t size)
10584 {
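/*
 * The first test guards against size + dtrace_buffer_memory_inuse
 * wrapping around before the second test compares the sum against the
 * configured maximum.
 */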
10585 if (size > (UINT64_MAX - dtrace_buffer_memory_inuse))
10586 return (B_FALSE);
10587 if ((size + dtrace_buffer_memory_inuse) > dtrace_buffer_memory_maxsize)
10588 return (B_FALSE);
10589
10590 return (B_TRUE);
10591 }
10592
10593 static int
10594 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
10595 processorid_t cpu)
10596 {
10597 dtrace_cpu_t *cp;
10598 dtrace_buffer_t *buf;
10599 size_t size_before_alloc = dtrace_buffer_memory_inuse;
10600
10601 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
10602 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10603
10604 if (size > (size_t)dtrace_nonroot_maxsize &&
10605 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
10606 return (EFBIG);
10607
10608 cp = cpu_list;
10609
10610 do {
10611 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10612 continue;
10613
10614 buf = &bufs[cp->cpu_id];
10615
10616 /*
10617 * If there is already a buffer allocated for this CPU, it
10618 * is only possible that this is a DR event. In this case,
10619 * the buffer size must match our specified size.
10620 */
10621 if (buf->dtb_tomax != NULL) {
10622 ASSERT(buf->dtb_size == size);
10623 continue;
10624 }
10625
10626 ASSERT(buf->dtb_xamot == NULL);
10627
10628 /* DTrace, please do not eat all the memory. */
10629 if (dtrace_buffer_canalloc(size) == B_FALSE)
10630 goto err;
10631 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10632 goto err;
10633 dtrace_buffer_memory_inuse += size;
10634
10635 buf->dtb_size = size;
10636 buf->dtb_flags = flags;
10637 buf->dtb_offset = 0;
10638 buf->dtb_drops = 0;
10639
10640 if (flags & DTRACEBUF_NOSWITCH)
10641 continue;
10642
10643 /* DTrace, please do not eat all the memory. */
10644 if (dtrace_buffer_canalloc(size) == B_FALSE)
10645 goto err;
10646 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10647 goto err;
10648 dtrace_buffer_memory_inuse += size;
10649 } while ((cp = cp->cpu_next) != cpu_list);
10650
10651 ASSERT(dtrace_buffer_memory_inuse <= dtrace_buffer_memory_maxsize);
10652
10653 return (0);
10654
10655 err:
10656 cp = cpu_list;
10657
10658 do {
10659 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10660 continue;
10661
10662 buf = &bufs[cp->cpu_id];
10663
10664 if (buf->dtb_xamot != NULL) {
10665 ASSERT(buf->dtb_tomax != NULL);
10666 ASSERT(buf->dtb_size == size);
10667 kmem_free(buf->dtb_xamot, size);
10668 }
10669
10670 if (buf->dtb_tomax != NULL) {
10671 ASSERT(buf->dtb_size == size);
10672 kmem_free(buf->dtb_tomax, size);
10673 }
10674
10675 buf->dtb_tomax = NULL;
10676 buf->dtb_xamot = NULL;
10677 buf->dtb_size = 0;
10678 } while ((cp = cp->cpu_next) != cpu_list);
10679
10680 /* Restore the size saved before allocating memory */
10681 dtrace_buffer_memory_inuse = size_before_alloc;
10682
10683 return (ENOMEM);
10684 }
10685
10686 /*
10687 * Note: called from probe context. This function just increments the drop
10688 * count on a buffer. It has been made a function to allow for the
10689 * possibility of understanding the source of mysterious drop counts. (A
10690 * problem for which one may be particularly disappointed that DTrace cannot
10691 * be used to understand DTrace.)
10692 */
10693 static void
10694 dtrace_buffer_drop(dtrace_buffer_t *buf)
10695 {
10696 buf->dtb_drops++;
10697 }
10698
10699 /*
10700 * Note: called from probe context. This function is called to reserve space
10701 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
10702 * mstate. Returns the new offset in the buffer, or a negative value if an
10703 * error has occurred.
10704 */
10705 static intptr_t
10706 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
10707 dtrace_state_t *state, dtrace_mstate_t *mstate)
10708 {
10709 intptr_t offs = buf->dtb_offset, soffs;
10710 intptr_t woffs;
10711 caddr_t tomax;
10712 size_t total_off;
10713
10714 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
10715 return (-1);
10716
10717 if ((tomax = buf->dtb_tomax) == NULL) {
10718 dtrace_buffer_drop(buf);
10719 return (-1);
10720 }
10721
10722 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
10723 while (offs & (align - 1)) {
10724 /*
10725 * Assert that our alignment is off by a number which
10726 * is itself sizeof (uint32_t) aligned.
10727 */
10728 ASSERT(!((align - (offs & (align - 1))) &
10729 (sizeof (uint32_t) - 1)));
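/*
 * Each 32-bit padding word is stamped with DTRACE_EPIDNONE so that
 * the consumer (and the ring-buffer reclamation loop below) can
 * recognize it as filler and skip over it.
 */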
10730 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10731 offs += sizeof (uint32_t);
10732 }
10733
10734 if ((uint64_t)(soffs = offs + needed) > buf->dtb_size) {
10735 dtrace_buffer_drop(buf);
10736 return (-1);
10737 }
10738
10739 if (mstate == NULL)
10740 return (offs);
10741
10742 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
10743 mstate->dtms_scratch_size = buf->dtb_size - soffs;
10744 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10745
10746 return (offs);
10747 }
10748
10749 if (buf->dtb_flags & DTRACEBUF_FILL) {
10750 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
10751 (buf->dtb_flags & DTRACEBUF_FULL))
10752 return (-1);
10753 goto out;
10754 }
10755
10756 total_off = needed + (offs & (align - 1));
10757
10758 /*
10759 * For a ring buffer, life is quite a bit more complicated. Before
10760 * we can store any padding, we need to adjust our wrapping offset.
10761 * (If we've never before wrapped or we're not about to, no adjustment
10762 * is required.)
10763 */
10764 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
10765 offs + total_off > buf->dtb_size) {
10766 woffs = buf->dtb_xamot_offset;
10767
10768 if (offs + total_off > buf->dtb_size) {
10769 /*
10770 * We can't fit in the end of the buffer. First, a
10771 * sanity check that we can fit in the buffer at all.
10772 */
10773 if (total_off > buf->dtb_size) {
10774 dtrace_buffer_drop(buf);
10775 return (-1);
10776 }
10777
10778 /*
10779 * We're going to be storing at the top of the buffer,
10780 * so now we need to deal with the wrapped offset. We
10781 * only reset our wrapped offset to 0 if it is
10782 * currently greater than the current offset. If it
10783 * is less than the current offset, it is because a
10784 * previous allocation induced a wrap -- but the
10785 * allocation didn't subsequently take the space due
10786 * to an error or false predicate evaluation. In this
10787 * case, we'll just leave the wrapped offset alone: if
10788 * the wrapped offset hasn't been advanced far enough
10789 * for this allocation, it will be adjusted in the
10790 * lower loop.
10791 */
10792 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
10793 if (woffs >= offs)
10794 woffs = 0;
10795 } else {
10796 woffs = 0;
10797 }
10798
10799 /*
10800 * Now we know that we're going to be storing to the
10801 * top of the buffer and that there is room for us
10802 * there. We need to clear the buffer from the current
10803 * offset to the end (there may be old gunk there).
10804 */
10805 while ((uint64_t)offs < buf->dtb_size)
10806 tomax[offs++] = 0;
10807
10808 /*
10809 * We need to set our offset to zero. And because we
10810 * are wrapping, we need to set the bit indicating as
10811 * much. We can also adjust our needed space back
10812 * down to the space required by the ECB -- we know
10813 * that the top of the buffer is aligned.
10814 */
10815 offs = 0;
10816 total_off = needed;
10817 buf->dtb_flags |= DTRACEBUF_WRAPPED;
10818 } else {
10819 /*
10820 * There is room for us in the buffer, so we simply
10821 * need to check the wrapped offset.
10822 */
10823 if (woffs < offs) {
10824 /*
10825 * The wrapped offset is less than the offset.
10826 * This can happen if we allocated buffer space
10827 * that induced a wrap, but then we didn't
10828 * subsequently take the space due to an error
10829 * or false predicate evaluation. This is
10830 * okay; we know that _this_ allocation isn't
10831 * going to induce a wrap. We still can't
10832 * reset the wrapped offset to be zero,
10833 * however: the space may have been trashed in
10834 * the previous failed probe attempt. But at
10835 * least the wrapped offset doesn't need to
10836 * be adjusted at all...
10837 */
10838 goto out;
10839 }
10840 }
10841
10842 while (offs + total_off > (size_t)woffs) {
10843 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
10844 size_t size;
10845
10846 if (epid == DTRACE_EPIDNONE) {
10847 size = sizeof (uint32_t);
10848 } else {
10849 ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
10850 ASSERT(state->dts_ecbs[epid - 1] != NULL);
10851
10852 size = state->dts_ecbs[epid - 1]->dte_size;
10853 }
10854
10855 ASSERT(woffs + size <= buf->dtb_size);
10856 ASSERT(size != 0);
10857
10858 if (woffs + size == buf->dtb_size) {
10859 /*
10860 * We've reached the end of the buffer; we want
10861 * to set the wrapped offset to 0 and break
10862 * out. However, if the offs is 0, then we're
10863 * in a strange edge-condition: the amount of
10864 * space that we want to reserve plus the size
10865 * of the record that we're overwriting is
10866 * greater than the size of the buffer. This
10867 * is problematic because if we reserve the
10868 * space but subsequently don't consume it (due
10869 * to a failed predicate or error) the wrapped
10870 * offset will be 0 -- yet the EPID at offset 0
10871 * will not be committed. This situation is
10872 * relatively easy to deal with: if we're in
10873 * this case, the buffer is indistinguishable
10874 * from one that hasn't wrapped; we need only
10875 * finish the job by clearing the wrapped bit,
10876 * explicitly setting the offset to be 0, and
10877 * zero'ing out the old data in the buffer.
10878 */
10879 if (offs == 0) {
10880 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
10881 buf->dtb_offset = 0;
10882 woffs = total_off;
10883
10884 while ((uint64_t)woffs < buf->dtb_size)
10885 tomax[woffs++] = 0;
10886 }
10887
10888 woffs = 0;
10889 break;
10890 }
10891
10892 woffs += size;
10893 }
10894
10895 /*
10896 * We have a wrapped offset. It may be that the wrapped offset
10897 * has become zero -- that's okay.
10898 */
10899 buf->dtb_xamot_offset = woffs;
10900 }
10901
10902 out:
10903 /*
10904 * Now we can plow the buffer with any necessary padding.
10905 */
10906 while (offs & (align - 1)) {
10907 /*
10908 * Assert that our alignment is off by a number which
10909 * is itself sizeof (uint32_t) aligned.
10910 */
10911 ASSERT(!((align - (offs & (align - 1))) &
10912 (sizeof (uint32_t) - 1)));
10913 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10914 offs += sizeof (uint32_t);
10915 }
10916
10917 if (buf->dtb_flags & DTRACEBUF_FILL) {
10918 if (offs + needed > buf->dtb_size - state->dts_reserve) {
10919 buf->dtb_flags |= DTRACEBUF_FULL;
10920 return (-1);
10921 }
10922 }
10923
10924 if (mstate == NULL)
10925 return (offs);
10926
10927 /*
10928 * For ring buffers and fill buffers, the scratch space is always
10929 * the inactive buffer.
10930 */
10931 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
10932 mstate->dtms_scratch_size = buf->dtb_size;
10933 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10934
10935 return (offs);
10936 }
10937
10938 static void
10939 dtrace_buffer_polish(dtrace_buffer_t *buf)
10940 {
10941 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
10942 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10943
10944 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
10945 return;
10946
10947 /*
10948 * We need to polish the ring buffer. There are three cases:
10949 *
10950 * - The first (and presumably most common) is that there is no gap
10951 * between the buffer offset and the wrapped offset. In this case,
10952 * there is nothing in the buffer that isn't valid data; we can
10953 * mark the buffer as polished and return.
10954 *
10955 * - The second (less common than the first but still more common
10956 * than the third) is that there is a gap between the buffer offset
10957 * and the wrapped offset, and the wrapped offset is larger than the
10958 * buffer offset. This can happen because of an alignment issue, or
10959 * can happen because of a call to dtrace_buffer_reserve() that
10960 * didn't subsequently consume the buffer space. In this case,
10961 * we need to zero the data from the buffer offset to the wrapped
10962 * offset.
10963 *
10964 * - The third (and least common) is that there is a gap between the
10965 * buffer offset and the wrapped offset, but the wrapped offset is
10966 * _less_ than the buffer offset. This can only happen because a
10967 * call to dtrace_buffer_reserve() induced a wrap, but the space
10968 * was not subsequently consumed. In this case, we need to zero the
10969 * space from the offset to the end of the buffer _and_ from the
10970 * top of the buffer to the wrapped offset.
10971 */
10972 if (buf->dtb_offset < buf->dtb_xamot_offset) {
10973 bzero(buf->dtb_tomax + buf->dtb_offset,
10974 buf->dtb_xamot_offset - buf->dtb_offset);
10975 }
10976
10977 if (buf->dtb_offset > buf->dtb_xamot_offset) {
10978 bzero(buf->dtb_tomax + buf->dtb_offset,
10979 buf->dtb_size - buf->dtb_offset);
10980 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
10981 }
10982 }
10983
10984 static void
10985 dtrace_buffer_free(dtrace_buffer_t *bufs)
10986 {
10987 int i;
10988
10989 for (i = 0; i < (int)NCPU; i++) {
10990 dtrace_buffer_t *buf = &bufs[i];
10991
10992 if (buf->dtb_tomax == NULL) {
10993 ASSERT(buf->dtb_xamot == NULL);
10994 ASSERT(buf->dtb_size == 0);
10995 continue;
10996 }
10997
10998 if (buf->dtb_xamot != NULL) {
10999 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11000 kmem_free(buf->dtb_xamot, buf->dtb_size);
11001
11002 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
11003 dtrace_buffer_memory_inuse -= buf->dtb_size;
11004 }
11005
11006 kmem_free(buf->dtb_tomax, buf->dtb_size);
11007 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
11008 dtrace_buffer_memory_inuse -= buf->dtb_size;
11009
11010 buf->dtb_size = 0;
11011 buf->dtb_tomax = NULL;
11012 buf->dtb_xamot = NULL;
11013 }
11014 }
11015
11016 /*
11017 * DTrace Enabling Functions
11018 */
11019 static dtrace_enabling_t *
11020 dtrace_enabling_create(dtrace_vstate_t *vstate)
11021 {
11022 dtrace_enabling_t *enab;
11023
11024 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
11025 enab->dten_vstate = vstate;
11026
11027 return (enab);
11028 }
11029
11030 static void
11031 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
11032 {
11033 dtrace_ecbdesc_t **ndesc;
11034 size_t osize, nsize;
11035
11036 /*
11037 * We can't add to enablings after we've enabled them, or after we've
11038 * retained them.
11039 */
11040 ASSERT(enab->dten_probegen == 0);
11041 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11042
11043 /* APPLE NOTE: this protects against gcc 4.0 botch on x86 */
11044 if (ecb == NULL) return;
11045
11046 if (enab->dten_ndesc < enab->dten_maxdesc) {
11047 enab->dten_desc[enab->dten_ndesc++] = ecb;
11048 return;
11049 }
11050
11051 osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
11052
11053 if (enab->dten_maxdesc == 0) {
11054 enab->dten_maxdesc = 1;
11055 } else {
11056 enab->dten_maxdesc <<= 1;
11057 }
11058
11059 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
11060
11061 nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
11062 ndesc = kmem_zalloc(nsize, KM_SLEEP);
11063 bcopy(enab->dten_desc, ndesc, osize);
11064 kmem_free(enab->dten_desc, osize);
11065
11066 enab->dten_desc = ndesc;
11067 enab->dten_desc[enab->dten_ndesc++] = ecb;
11068 }
11069
11070 static void
11071 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
11072 dtrace_probedesc_t *pd)
11073 {
11074 dtrace_ecbdesc_t *new;
11075 dtrace_predicate_t *pred;
11076 dtrace_actdesc_t *act;
11077
11078 /*
11079 * We're going to create a new ECB description that matches the
11080 * specified ECB in every way, but has the specified probe description.
11081 */
11082 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11083
11084 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
11085 dtrace_predicate_hold(pred);
11086
11087 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
11088 dtrace_actdesc_hold(act);
11089
11090 new->dted_action = ecb->dted_action;
11091 new->dted_pred = ecb->dted_pred;
11092 new->dted_probe = *pd;
11093 new->dted_uarg = ecb->dted_uarg;
11094
11095 dtrace_enabling_add(enab, new);
11096 }
11097
11098 static void
11099 dtrace_enabling_dump(dtrace_enabling_t *enab)
11100 {
11101 int i;
11102
11103 for (i = 0; i < enab->dten_ndesc; i++) {
11104 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
11105
11106 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
11107 desc->dtpd_provider, desc->dtpd_mod,
11108 desc->dtpd_func, desc->dtpd_name);
11109 }
11110 }
11111
11112 static void
11113 dtrace_enabling_destroy(dtrace_enabling_t *enab)
11114 {
11115 int i;
11116 dtrace_ecbdesc_t *ep;
11117 dtrace_vstate_t *vstate = enab->dten_vstate;
11118
11119 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11120
11121 for (i = 0; i < enab->dten_ndesc; i++) {
11122 dtrace_actdesc_t *act, *next;
11123 dtrace_predicate_t *pred;
11124
11125 ep = enab->dten_desc[i];
11126
11127 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
11128 dtrace_predicate_release(pred, vstate);
11129
11130 for (act = ep->dted_action; act != NULL; act = next) {
11131 next = act->dtad_next;
11132 dtrace_actdesc_release(act, vstate);
11133 }
11134
11135 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11136 }
11137
11138 kmem_free(enab->dten_desc,
11139 enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
11140
11141 /*
11142 * If this was a retained enabling, decrement the dts_nretained count
11143 * and take it off of the dtrace_retained list.
11144 */
11145 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
11146 dtrace_retained == enab) {
11147 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11148 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
11149 enab->dten_vstate->dtvs_state->dts_nretained--;
11150 dtrace_retained_gen++;
11151 }
11152
11153 if (enab->dten_prev == NULL) {
11154 if (dtrace_retained == enab) {
11155 dtrace_retained = enab->dten_next;
11156
11157 if (dtrace_retained != NULL)
11158 dtrace_retained->dten_prev = NULL;
11159 }
11160 } else {
11161 ASSERT(enab != dtrace_retained);
11162 ASSERT(dtrace_retained != NULL);
11163 enab->dten_prev->dten_next = enab->dten_next;
11164 }
11165
11166 if (enab->dten_next != NULL) {
11167 ASSERT(dtrace_retained != NULL);
11168 enab->dten_next->dten_prev = enab->dten_prev;
11169 }
11170
11171 kmem_free(enab, sizeof (dtrace_enabling_t));
11172 }
11173
11174 static int
11175 dtrace_enabling_retain(dtrace_enabling_t *enab)
11176 {
11177 dtrace_state_t *state;
11178
11179 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11180 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11181 ASSERT(enab->dten_vstate != NULL);
11182
11183 state = enab->dten_vstate->dtvs_state;
11184 ASSERT(state != NULL);
11185
11186 /*
11187 * We only allow each state to retain dtrace_retain_max enablings.
11188 */
11189 if (state->dts_nretained >= dtrace_retain_max)
11190 return (ENOSPC);
11191
11192 state->dts_nretained++;
11193 dtrace_retained_gen++;
11194
11195 if (dtrace_retained == NULL) {
11196 dtrace_retained = enab;
11197 return (0);
11198 }
11199
11200 enab->dten_next = dtrace_retained;
11201 dtrace_retained->dten_prev = enab;
11202 dtrace_retained = enab;
11203
11204 return (0);
11205 }
11206
11207 static int
11208 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
11209 dtrace_probedesc_t *create)
11210 {
11211 dtrace_enabling_t *new, *enab;
11212 int found = 0, err = ENOENT;
11213
11214 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11215 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
11216 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
11217 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
11218 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
11219
11220 new = dtrace_enabling_create(&state->dts_vstate);
11221
11222 /*
11223 * Iterate over all retained enablings, looking for enablings that
11224 * match the specified state.
11225 */
11226 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11227 int i;
11228
11229 /*
11230 * dtvs_state can only be NULL for helper enablings -- and
11231 * helper enablings can't be retained.
11232 */
11233 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11234
11235 if (enab->dten_vstate->dtvs_state != state)
11236 continue;
11237
11238 /*
11239 * Now iterate over each probe description; we're looking for
11240 * an exact match to the specified probe description.
11241 */
11242 for (i = 0; i < enab->dten_ndesc; i++) {
11243 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11244 dtrace_probedesc_t *pd = &ep->dted_probe;
11245
11246 /* APPLE NOTE: Darwin employs size bounded string operation. */
11247 if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
11248 continue;
11249
11250 if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
11251 continue;
11252
11253 if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
11254 continue;
11255
11256 if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
11257 continue;
11258
11259 /*
11260 * We have a winning probe! Add it to our growing
11261 * enabling.
11262 */
11263 found = 1;
11264 dtrace_enabling_addlike(new, ep, create);
11265 }
11266 }
11267
11268 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
11269 dtrace_enabling_destroy(new);
11270 return (err);
11271 }
11272
11273 return (0);
11274 }
11275
11276 static void
11277 dtrace_enabling_retract(dtrace_state_t *state)
11278 {
11279 dtrace_enabling_t *enab, *next;
11280
11281 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11282
11283 /*
11284 * Iterate over all retained enablings, destroying those retained
11285 * for the specified state.
11286 */
11287 for (enab = dtrace_retained; enab != NULL; enab = next) {
11288 next = enab->dten_next;
11289
11290 /*
11291 * dtvs_state can only be NULL for helper enablings -- and
11292 * helper enablings can't be retained.
11293 */
11294 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11295
11296 if (enab->dten_vstate->dtvs_state == state) {
11297 ASSERT(state->dts_nretained > 0);
11298 dtrace_enabling_destroy(enab);
11299 }
11300 }
11301
11302 ASSERT(state->dts_nretained == 0);
11303 }
11304
11305 static int
11306 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
11307 {
11308 int i = 0;
11309 int total_matched = 0, matched = 0;
11310
11311 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11312 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11313
11314 for (i = 0; i < enab->dten_ndesc; i++) {
11315 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11316
11317 enab->dten_current = ep;
11318 enab->dten_error = 0;
11319
11320 /*
11321 * If a provider failed to enable a probe then get out and
11322 * let the consumer know we failed.
11323 */
11324 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0)
11325 return (EBUSY);
11326
11327 total_matched += matched;
11328
11329 if (enab->dten_error != 0) {
11330 /*
11331 * If we get an error half-way through enabling the
11332 * probes, we kick out -- perhaps with some number of
11333 * them enabled. Leaving enabled probes enabled may
11334 * be slightly confusing for user-level, but we expect
11335 * that no one will attempt to actually drive on in
11336 * the face of such errors. If this is an anonymous
11337 * enabling (indicated with a NULL nmatched pointer),
11338 * we cmn_err() a message. We aren't expecting to
11339 * get such an error -- to the extent it can exist at all,
11340 * it would be a result of corrupted DOF in the driver
11341 * properties.
11342 */
11343 if (nmatched == NULL) {
11344 cmn_err(CE_WARN, "dtrace_enabling_match() "
11345 "error on %p: %d", (void *)ep,
11346 enab->dten_error);
11347 }
11348
11349 return (enab->dten_error);
11350 }
11351 }
11352
11353 enab->dten_probegen = dtrace_probegen;
11354 if (nmatched != NULL)
11355 *nmatched = total_matched;
11356
11357 return (0);
11358 }
11359
11360 static void
11361 dtrace_enabling_matchall(void)
11362 {
11363 dtrace_enabling_t *enab;
11364
11365 lck_mtx_lock(&cpu_lock);
11366 lck_mtx_lock(&dtrace_lock);
11367
11368 /*
11369 * Iterate over all retained enablings to see if any probes match
11370 * against them. We only perform this operation on enablings for which
11371 * we have sufficient permissions by virtue of being in the global zone
11372 * or in the same zone as the DTrace client. Because we can be called
11373 * after dtrace_detach() has been called, we cannot assert that there
11374 * are retained enablings. We can safely load from dtrace_retained,
11375 * however: the taskq_destroy() at the end of dtrace_detach() will
11376 * block pending our completion.
11377 */
11378
11379 /*
11380 * Darwin doesn't do zones.
11381 * Behave as if always in the "global" zone.
11382 */
11383 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11384 (void) dtrace_enabling_match(enab, NULL);
11385 }
11386
11387 lck_mtx_unlock(&dtrace_lock);
11388 lck_mtx_unlock(&cpu_lock);
11389 }
11390
11391 /*
11392 * If an enabling is to be enabled without having matched probes (that is, if
11393 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
11394 * enabling must be _primed_ by creating an ECB for every ECB description.
11395 * This must be done to assure that we know the number of speculations, the
11396 * number of aggregations, the minimum buffer size needed, etc. before we
11397 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
11398 * enabling any probes, we create ECBs for every ECB description, but with a
11399 * NULL probe -- which is exactly what this function does.
11400 */
11401 static void
11402 dtrace_enabling_prime(dtrace_state_t *state)
11403 {
11404 dtrace_enabling_t *enab;
11405 int i;
11406
11407 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11408 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11409
11410 if (enab->dten_vstate->dtvs_state != state)
11411 continue;
11412
11413 /*
11414 * We don't want to prime an enabling more than once, lest
11415 * we allow a malicious user to induce resource exhaustion.
11416 * (The ECBs that result from priming an enabling aren't
11417 * leaked -- but they also aren't deallocated until the
11418 * consumer state is destroyed.)
11419 */
11420 if (enab->dten_primed)
11421 continue;
11422
11423 for (i = 0; i < enab->dten_ndesc; i++) {
11424 enab->dten_current = enab->dten_desc[i];
11425 (void) dtrace_probe_enable(NULL, enab);
11426 }
11427
11428 enab->dten_primed = 1;
11429 }
11430 }
11431
11432 /*
11433 * Called to indicate that probes should be provided due to retained
11434 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
11435 * must take an initial lap through the enabling calling the dtps_provide()
11436 * entry point explicitly to allow for autocreated probes.
11437 */
11438 static void
11439 dtrace_enabling_provide(dtrace_provider_t *prv)
11440 {
11441 int i, all = 0;
11442 dtrace_probedesc_t desc;
11443 dtrace_genid_t gen;
11444
11445 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11446 lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
11447
11448 if (prv == NULL) {
11449 all = 1;
11450 prv = dtrace_provider;
11451 }
11452
11453 do {
11454 dtrace_enabling_t *enab;
11455 void *parg = prv->dtpv_arg;
11456
11457 retry:
11458 gen = dtrace_retained_gen;
11459 for (enab = dtrace_retained; enab != NULL;
11460 enab = enab->dten_next) {
11461 for (i = 0; i < enab->dten_ndesc; i++) {
11462 desc = enab->dten_desc[i]->dted_probe;
11463 lck_mtx_unlock(&dtrace_lock);
11464 prv->dtpv_pops.dtps_provide(parg, &desc);
11465 lck_mtx_lock(&dtrace_lock);
11466 /*
11467 * Process the retained enablings again if
11468 * they have changed while we weren't holding
11469 * dtrace_lock.
11470 */
11471 if (gen != dtrace_retained_gen)
11472 goto retry;
11473 }
11474 }
11475 } while (all && (prv = prv->dtpv_next) != NULL);
11476
11477 lck_mtx_unlock(&dtrace_lock);
11478 dtrace_probe_provide(NULL, all ? NULL : prv);
11479 lck_mtx_lock(&dtrace_lock);
11480 }
11481
11482 /*
11483 * DTrace DOF Functions
11484 */
11485 /*ARGSUSED*/
11486 static void
11487 dtrace_dof_error(dof_hdr_t *dof, const char *str)
11488 {
11489 #pragma unused(dof) /* __APPLE__ */
11490 if (dtrace_err_verbose)
11491 cmn_err(CE_WARN, "failed to process DOF: %s", str);
11492
11493 #ifdef DTRACE_ERRDEBUG
11494 dtrace_errdebug(str);
11495 #endif
11496 }
11497
11498 /*
11499 * Create DOF out of a currently enabled state. Right now, we only create
11500 * DOF containing the run-time options -- but this could be expanded to create
11501 * complete DOF representing the enabled state.
11502 */
11503 static dof_hdr_t *
11504 dtrace_dof_create(dtrace_state_t *state)
11505 {
11506 dof_hdr_t *dof;
11507 dof_sec_t *sec;
11508 dof_optdesc_t *opt;
11509 int i, len = sizeof (dof_hdr_t) +
11510 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
11511 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11512
11513 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11514
11515 dof = dt_kmem_zalloc_aligned(len, 8, KM_SLEEP);
11516 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
11517 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
11518 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
11519 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
11520
11521 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
11522 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
11523 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
11524 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
11525 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
11526 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
11527
11528 dof->dofh_flags = 0;
11529 dof->dofh_hdrsize = sizeof (dof_hdr_t);
11530 dof->dofh_secsize = sizeof (dof_sec_t);
11531 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
11532 dof->dofh_secoff = sizeof (dof_hdr_t);
11533 dof->dofh_loadsz = len;
11534 dof->dofh_filesz = len;
11535 dof->dofh_pad = 0;
11536
11537 /*
11538 * Fill in the option section header...
11539 */
11540 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
11541 sec->dofs_type = DOF_SECT_OPTDESC;
11542 sec->dofs_align = sizeof (uint64_t);
11543 sec->dofs_flags = DOF_SECF_LOAD;
11544 sec->dofs_entsize = sizeof (dof_optdesc_t);
11545
11546 opt = (dof_optdesc_t *)((uintptr_t)sec +
11547 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
11548
11549 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
11550 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11551
11552 for (i = 0; i < DTRACEOPT_MAX; i++) {
11553 opt[i].dofo_option = i;
11554 opt[i].dofo_strtab = DOF_SECIDX_NONE;
11555 opt[i].dofo_value = state->dts_options[i];
11556 }
11557
11558 return (dof);
11559 }
11560
11561 static dof_hdr_t *
11562 dtrace_dof_copyin(user_addr_t uarg, int *errp)
11563 {
11564 dof_hdr_t hdr, *dof;
11565
11566 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
11567
11568 /*
11569 * First, we're going to copyin() the sizeof (dof_hdr_t).
11570 */
11571 if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
11572 dtrace_dof_error(NULL, "failed to copyin DOF header");
11573 *errp = EFAULT;
11574 return (NULL);
11575 }
11576
11577 /*
11578 * Now we'll allocate the entire DOF and copy it in -- provided
11579 * that the length isn't outrageous.
11580 */
11581 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
11582 dtrace_dof_error(&hdr, "load size exceeds maximum");
11583 *errp = E2BIG;
11584 return (NULL);
11585 }
11586
11587 if (hdr.dofh_loadsz < sizeof (hdr)) {
11588 dtrace_dof_error(&hdr, "invalid load size");
11589 *errp = EINVAL;
11590 return (NULL);
11591 }
11592
11593 dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
11594
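/*
 * Note that dofh_loadsz is checked again after the full copyin(): the
 * user buffer is read twice, so a racing change to the header between
 * the two copies could otherwise leave the in-kernel copy claiming a
 * size other than the one we allocated.
 */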
11595 if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 ||
11596 dof->dofh_loadsz != hdr.dofh_loadsz) {
11597 dt_kmem_free_aligned(dof, hdr.dofh_loadsz);
11598 *errp = EFAULT;
11599 return (NULL);
11600 }
11601
11602 return (dof);
11603 }
11604
11605 static dof_hdr_t *
11606 dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
11607 {
11608 dof_hdr_t hdr, *dof;
11609
11610 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
11611
11612 /*
11613 * First, we're going to copyin() the sizeof (dof_hdr_t).
11614 */
11615 if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
11616 dtrace_dof_error(NULL, "failed to copyin DOF header");
11617 *errp = EFAULT;
11618 return (NULL);
11619 }
11620
11621 /*
11622 * Now we'll allocate the entire DOF and copy it in -- provided
11623 * that the length isn't outrageous.
11624 */
11625 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
11626 dtrace_dof_error(&hdr, "load size exceeds maximum");
11627 *errp = E2BIG;
11628 return (NULL);
11629 }
11630
11631 if (hdr.dofh_loadsz < sizeof (hdr)) {
11632 dtrace_dof_error(&hdr, "invalid load size");
11633 *errp = EINVAL;
11634 return (NULL);
11635 }
11636
11637 dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
11638
11639 if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
11640 dt_kmem_free_aligned(dof, hdr.dofh_loadsz);
11641 *errp = EFAULT;
11642 return (NULL);
11643 }
11644
11645 return (dof);
11646 }
11647
11648 static dof_hdr_t *
11649 dtrace_dof_property(const char *name)
11650 {
11651 uchar_t *buf;
11652 uint64_t loadsz;
11653 unsigned int len, i;
11654 dof_hdr_t *dof;
11655
11656 /*
11657 * Unfortunately, arrays of values in .conf files are always (and
11658 * only) interpreted to be integer arrays. We must read our DOF
11659 * as an integer array, and then squeeze it into a byte array.
11660 */
11661 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
11662 name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
11663 return (NULL);
11664
11665 for (i = 0; i < len; i++)
11666 buf[i] = (uchar_t)(((int *)buf)[i]);
11667
11668 if (len < sizeof (dof_hdr_t)) {
11669 ddi_prop_free(buf);
11670 dtrace_dof_error(NULL, "truncated header");
11671 return (NULL);
11672 }
11673
11674 if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
11675 ddi_prop_free(buf);
11676 dtrace_dof_error(NULL, "truncated DOF");
11677 return (NULL);
11678 }
11679
11680 if (loadsz >= (uint64_t)dtrace_dof_maxsize) {
11681 ddi_prop_free(buf);
11682 dtrace_dof_error(NULL, "oversized DOF");
11683 return (NULL);
11684 }
11685
11686 dof = dt_kmem_alloc_aligned(loadsz, 8, KM_SLEEP);
11687 bcopy(buf, dof, loadsz);
11688 ddi_prop_free(buf);
11689
11690 return (dof);
11691 }
11692
11693 static void
11694 dtrace_dof_destroy(dof_hdr_t *dof)
11695 {
11696 dt_kmem_free_aligned(dof, dof->dofh_loadsz);
11697 }
11698
11699 /*
11700 * Return the dof_sec_t pointer corresponding to a given section index. If the
11701 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
11702 * a type other than DOF_SECT_NONE is specified, the header is checked against
11703 * this type and NULL is returned if the types do not match.
11704 */
11705 static dof_sec_t *
11706 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
11707 {
11708 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
11709 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
11710
11711 if (i >= dof->dofh_secnum) {
11712 dtrace_dof_error(dof, "referenced section index is invalid");
11713 return (NULL);
11714 }
11715
11716 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
11717 dtrace_dof_error(dof, "referenced section is not loadable");
11718 return (NULL);
11719 }
11720
11721 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
11722 dtrace_dof_error(dof, "referenced section is the wrong type");
11723 return (NULL);
11724 }
11725
11726 return (sec);
11727 }
11728
11729 static dtrace_probedesc_t *
11730 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
11731 {
11732 dof_probedesc_t *probe;
11733 dof_sec_t *strtab;
11734 uintptr_t daddr = (uintptr_t)dof;
11735 uintptr_t str;
11736 size_t size;
11737
11738 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
11739 dtrace_dof_error(dof, "invalid probe section");
11740 return (NULL);
11741 }
11742
11743 if (sec->dofs_align != sizeof (dof_secidx_t)) {
11744 dtrace_dof_error(dof, "bad alignment in probe description");
11745 return (NULL);
11746 }
11747
11748 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
11749 dtrace_dof_error(dof, "truncated probe description");
11750 return (NULL);
11751 }
11752
11753 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
11754 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
11755
11756 if (strtab == NULL)
11757 return (NULL);
11758
11759 str = daddr + strtab->dofs_offset;
11760 size = strtab->dofs_size;
11761
11762 if (probe->dofp_provider >= strtab->dofs_size) {
11763 dtrace_dof_error(dof, "corrupt probe provider");
11764 return (NULL);
11765 }
11766
11767 (void) strncpy(desc->dtpd_provider,
11768 (char *)(str + probe->dofp_provider),
11769 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
11770
11771 /* APPLE NOTE: Darwin employs size bounded string operation. */
11772 desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
11773
11774 if (probe->dofp_mod >= strtab->dofs_size) {
11775 dtrace_dof_error(dof, "corrupt probe module");
11776 return (NULL);
11777 }
11778
11779 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
11780 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
11781
11782 /* APPLE NOTE: Darwin employs size bounded string operation. */
11783 desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
11784
11785 if (probe->dofp_func >= strtab->dofs_size) {
11786 dtrace_dof_error(dof, "corrupt probe function");
11787 return (NULL);
11788 }
11789
11790 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
11791 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
11792
11793 /* APPLE NOTE: Darwin employs size bounded string operation. */
11794 desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
11795
11796 if (probe->dofp_name >= strtab->dofs_size) {
11797 dtrace_dof_error(dof, "corrupt probe name");
11798 return (NULL);
11799 }
11800
11801 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
11802 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
11803
11804 /* APPLE NOTE: Darwin employs size bounded string operation. */
11805 desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
11806
11807 return (desc);
11808 }
11809
11810 static dtrace_difo_t *
11811 dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11812 cred_t *cr)
11813 {
11814 dtrace_difo_t *dp;
11815 size_t ttl = 0;
11816 dof_difohdr_t *dofd;
11817 uintptr_t daddr = (uintptr_t)dof;
11818 size_t max_size = dtrace_difo_maxsize;
11819 uint_t i;
11820 int l, n;
11821
11822
11823 static const struct {
11824 int section;
11825 int bufoffs;
11826 int lenoffs;
11827 int entsize;
11828 int align;
11829 const char *msg;
11830 } difo[] = {
11831 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
11832 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
11833 sizeof (dif_instr_t), "multiple DIF sections" },
11834
11835 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
11836 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
11837 sizeof (uint64_t), "multiple integer tables" },
11838
11839 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
11840 offsetof(dtrace_difo_t, dtdo_strlen), 0,
11841 sizeof (char), "multiple string tables" },
11842
11843 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
11844 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
11845 sizeof (uint_t), "multiple variable tables" },
11846
11847 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
11848 };
11849
11850 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
11851 dtrace_dof_error(dof, "invalid DIFO header section");
11852 return (NULL);
11853 }
11854
11855 if (sec->dofs_align != sizeof (dof_secidx_t)) {
11856 dtrace_dof_error(dof, "bad alignment in DIFO header");
11857 return (NULL);
11858 }
11859
11860 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
11861 sec->dofs_size % sizeof (dof_secidx_t)) {
11862 dtrace_dof_error(dof, "bad size in DIFO header");
11863 return (NULL);
11864 }
11865
11866 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11867 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
11868
11869 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
11870 dp->dtdo_rtype = dofd->dofd_rtype;
11871
11872 for (l = 0; l < n; l++) {
11873 dof_sec_t *subsec;
11874 void **bufp;
11875 uint32_t *lenp;
11876
11877 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
11878 dofd->dofd_links[l])) == NULL)
11879 goto err; /* invalid section link */
11880
11881 if (ttl + subsec->dofs_size > max_size) {
11882 dtrace_dof_error(dof, "exceeds maximum size");
11883 goto err;
11884 }
11885
11886 ttl += subsec->dofs_size;
11887
11888 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
11889
11890 if (subsec->dofs_type != (uint32_t)difo[i].section)
11891 continue;
11892
11893 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
11894 dtrace_dof_error(dof, "section not loaded");
11895 goto err;
11896 }
11897
11898 if (subsec->dofs_align != (uint32_t)difo[i].align) {
11899 dtrace_dof_error(dof, "bad alignment");
11900 goto err;
11901 }
11902
11903 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
11904 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
11905
11906 if (*bufp != NULL) {
11907 dtrace_dof_error(dof, difo[i].msg);
11908 goto err;
11909 }
11910
11911 if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
11912 dtrace_dof_error(dof, "entry size mismatch");
11913 goto err;
11914 }
11915
11916 if (subsec->dofs_entsize != 0 &&
11917 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
11918 dtrace_dof_error(dof, "corrupt entry size");
11919 goto err;
11920 }
11921
11922 *lenp = subsec->dofs_size;
11923 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
11924 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
11925 *bufp, subsec->dofs_size);
11926
11927 if (subsec->dofs_entsize != 0)
11928 *lenp /= subsec->dofs_entsize;
11929
11930 break;
11931 }
11932
11933 /*
11934 * If we encounter a loadable DIFO sub-section that is not
11935 * known to us, assume this is a broken program and fail.
11936 */
11937 if (difo[i].section == DOF_SECT_NONE &&
11938 (subsec->dofs_flags & DOF_SECF_LOAD)) {
11939 dtrace_dof_error(dof, "unrecognized DIFO subsection");
11940 goto err;
11941 }
11942 }
11943
11944 if (dp->dtdo_buf == NULL) {
11945 /*
11946 * We can't have a DIF object without DIF text.
11947 */
11948 dtrace_dof_error(dof, "missing DIF text");
11949 goto err;
11950 }
11951
11952 /*
11953 * Before we validate the DIF object, run through the variable table
11954 * looking for the strings -- if any of their sizes are zero, we'll set
11955 * their size to be the system-wide default string size. Note that
11956 * this should _not_ happen if the "strsize" option has been set --
11957 * in this case, the compiler should have set the size to reflect the
11958 * setting of the option.
11959 */
11960 for (i = 0; i < dp->dtdo_varlen; i++) {
11961 dtrace_difv_t *v = &dp->dtdo_vartab[i];
11962 dtrace_diftype_t *t = &v->dtdv_type;
11963
11964 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
11965 continue;
11966
11967 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
11968 t->dtdt_size = dtrace_strsize_default;
11969 }
11970
11971 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
11972 goto err;
11973
11974 dtrace_difo_init(dp, vstate);
11975 return (dp);
11976
11977 err:
11978 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
11979 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
11980 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
11981 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
11982
11983 kmem_free(dp, sizeof (dtrace_difo_t));
11984 return (NULL);
11985 }
11986
11987 static dtrace_predicate_t *
11988 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11989 cred_t *cr)
11990 {
11991 dtrace_difo_t *dp;
11992
11993 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
11994 return (NULL);
11995
11996 return (dtrace_predicate_create(dp));
11997 }
11998
11999 static dtrace_actdesc_t *
12000 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12001 cred_t *cr)
12002 {
12003 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
12004 dof_actdesc_t *desc;
12005 dof_sec_t *difosec;
12006 size_t offs;
12007 uintptr_t daddr = (uintptr_t)dof;
12008 uint64_t arg;
12009 dtrace_actkind_t kind;
12010
12011 if (sec->dofs_type != DOF_SECT_ACTDESC) {
12012 dtrace_dof_error(dof, "invalid action section");
12013 return (NULL);
12014 }
12015
12016 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
12017 dtrace_dof_error(dof, "truncated action description");
12018 return (NULL);
12019 }
12020
12021 if (sec->dofs_align != sizeof (uint64_t)) {
12022 dtrace_dof_error(dof, "bad alignment in action description");
12023 return (NULL);
12024 }
12025
12026 if (sec->dofs_size < sec->dofs_entsize) {
12027 dtrace_dof_error(dof, "section entry size exceeds total size");
12028 return (NULL);
12029 }
12030
12031 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
12032 dtrace_dof_error(dof, "bad entry size in action description");
12033 return (NULL);
12034 }
12035
12036 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
12037 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
12038 return (NULL);
12039 }
12040
12041 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
12042 desc = (dof_actdesc_t *)(daddr +
12043 (uintptr_t)sec->dofs_offset + offs);
12044 kind = (dtrace_actkind_t)desc->dofa_kind;
12045
12046 if (DTRACEACT_ISPRINTFLIKE(kind) &&
12047 (kind != DTRACEACT_PRINTA ||
12048 desc->dofa_strtab != DOF_SECIDX_NONE)) {
12049 dof_sec_t *strtab;
12050 char *str, *fmt;
12051 uint64_t i;
12052
12053 /*
12054 * printf()-like actions must have a format string.
12055 */
12056 if ((strtab = dtrace_dof_sect(dof,
12057 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
12058 goto err;
12059
12060 str = (char *)((uintptr_t)dof +
12061 (uintptr_t)strtab->dofs_offset);
12062
12063 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
12064 if (str[i] == '\0')
12065 break;
12066 }
12067
12068 if (i >= strtab->dofs_size) {
12069 dtrace_dof_error(dof, "bogus format string");
12070 goto err;
12071 }
12072
12073 if (i == desc->dofa_arg) {
12074 dtrace_dof_error(dof, "empty format string");
12075 goto err;
12076 }
12077
12078 i -= desc->dofa_arg;
12079 fmt = kmem_alloc(i + 1, KM_SLEEP);
12080 bcopy(&str[desc->dofa_arg], fmt, i + 1);
12081 arg = (uint64_t)(uintptr_t)fmt;
12082 } else {
12083 if (kind == DTRACEACT_PRINTA) {
12084 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
12085 arg = 0;
12086 } else {
12087 arg = desc->dofa_arg;
12088 }
12089 }
12090
12091 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
12092 desc->dofa_uarg, arg);
12093
12094 if (last != NULL) {
12095 last->dtad_next = act;
12096 } else {
12097 first = act;
12098 }
12099
12100 last = act;
12101
12102 if (desc->dofa_difo == DOF_SECIDX_NONE)
12103 continue;
12104
12105 if ((difosec = dtrace_dof_sect(dof,
12106 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
12107 goto err;
12108
12109 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
12110
12111 if (act->dtad_difo == NULL)
12112 goto err;
12113 }
12114
12115 ASSERT(first != NULL);
12116 return (first);
12117
12118 err:
12119 for (act = first; act != NULL; act = next) {
12120 next = act->dtad_next;
12121 dtrace_actdesc_release(act, vstate);
12122 }
12123
12124 return (NULL);
12125 }
12126
12127 static dtrace_ecbdesc_t *
12128 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12129 cred_t *cr)
12130 {
12131 dtrace_ecbdesc_t *ep;
12132 dof_ecbdesc_t *ecb;
12133 dtrace_probedesc_t *desc;
12134 dtrace_predicate_t *pred = NULL;
12135
12136 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
12137 dtrace_dof_error(dof, "truncated ECB description");
12138 return (NULL);
12139 }
12140
12141 if (sec->dofs_align != sizeof (uint64_t)) {
12142 dtrace_dof_error(dof, "bad alignment in ECB description");
12143 return (NULL);
12144 }
12145
12146 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
12147 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
12148
12149 if (sec == NULL)
12150 return (NULL);
12151
12152 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12153 ep->dted_uarg = ecb->dofe_uarg;
12154 desc = &ep->dted_probe;
12155
12156 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
12157 goto err;
12158
12159 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
12160 if ((sec = dtrace_dof_sect(dof,
12161 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
12162 goto err;
12163
12164 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
12165 goto err;
12166
12167 ep->dted_pred.dtpdd_predicate = pred;
12168 }
12169
12170 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
12171 if ((sec = dtrace_dof_sect(dof,
12172 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
12173 goto err;
12174
12175 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
12176
12177 if (ep->dted_action == NULL)
12178 goto err;
12179 }
12180
12181 return (ep);
12182
12183 err:
12184 if (pred != NULL)
12185 dtrace_predicate_release(pred, vstate);
12186 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12187 return (NULL);
12188 }
12189
12190 /*
12191 * APPLE NOTE: dyld handles dof relocation.
12192 * Darwin does not need dtrace_dof_relocate()
12193 */
12194
12195 /*
12196 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
12197 * header: it should be at the front of a memory region that is at least
12198 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
12199 * size. It need not be validated in any other way.
12200 */
12201 static int
12202 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
12203 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
12204 {
12205 #pragma unused(ubase) /* __APPLE__ */
12206 uint64_t len = dof->dofh_loadsz, seclen;
12207 uintptr_t daddr = (uintptr_t)dof;
12208 dtrace_ecbdesc_t *ep;
12209 dtrace_enabling_t *enab;
12210 uint_t i;
12211
12212 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12213 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
12214
12215 /*
12216 * Check the DOF header identification bytes. In addition to checking
12217 * valid settings, we also verify that unused bits/bytes are zeroed so
12218 * we can use them later without fear of regressing existing binaries.
12219 */
12220 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
12221 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
12222 dtrace_dof_error(dof, "DOF magic string mismatch");
12223 return (-1);
12224 }
12225
12226 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
12227 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
12228 dtrace_dof_error(dof, "DOF has invalid data model");
12229 return (-1);
12230 }
12231
12232 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
12233 dtrace_dof_error(dof, "DOF encoding mismatch");
12234 return (-1);
12235 }
12236
12237 /*
12238 * APPLE NOTE: Darwin only supports DOF_VERSION_3 for now.
12239 */
12240 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
12241 dtrace_dof_error(dof, "DOF version mismatch");
12242 return (-1);
12243 }
12244
12245 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
12246 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
12247 return (-1);
12248 }
12249
12250 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
12251 dtrace_dof_error(dof, "DOF uses too many integer registers");
12252 return (-1);
12253 }
12254
12255 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
12256 dtrace_dof_error(dof, "DOF uses too many tuple registers");
12257 return (-1);
12258 }
12259
12260 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
12261 if (dof->dofh_ident[i] != 0) {
12262 dtrace_dof_error(dof, "DOF has invalid ident byte set");
12263 return (-1);
12264 }
12265 }
12266
12267 if (dof->dofh_flags & ~DOF_FL_VALID) {
12268 dtrace_dof_error(dof, "DOF has invalid flag bits set");
12269 return (-1);
12270 }
12271
12272 if (dof->dofh_secsize == 0) {
12273 dtrace_dof_error(dof, "zero section header size");
12274 return (-1);
12275 }
12276
12277 /*
12278 * Check that the section headers don't exceed the amount of DOF
12279 * data. Note that we cast the section size and number of sections
12280 * to uint64_t's to prevent possible overflow in the multiplication.
12281 */
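/*
 * Worked example (editorial, hypothetical values): had the product been
 * computed in 32 bits, dofh_secnum = 0x01000000 and dofh_secsize = 0x100
 * would wrap to 0 and slip past the bounds check; as 64-bit values the
 * product is 0x100000000, which the "seclen > len" test below rejects
 * for any loadsz that survived the earlier maximum-size checks.
 */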
12282 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
12283
12284 if (dof->dofh_secoff > len || seclen > len ||
12285 dof->dofh_secoff + seclen > len) {
12286 dtrace_dof_error(dof, "truncated section headers");
12287 return (-1);
12288 }
12289
12290 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12291 dtrace_dof_error(dof, "misaligned section headers");
12292 return (-1);
12293 }
12294
12295 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12296 dtrace_dof_error(dof, "misaligned section size");
12297 return (-1);
12298 }
12299
12300 /*
12301 * Take an initial pass through the section headers to be sure that
12302 * the headers don't have stray offsets. If the 'noprobes' flag is
12303 * set, do not permit sections relating to providers, probes, or args.
12304 */
12305 for (i = 0; i < dof->dofh_secnum; i++) {
12306 dof_sec_t *sec = (dof_sec_t *)(daddr +
12307 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12308
12309 if (noprobes) {
12310 switch (sec->dofs_type) {
12311 case DOF_SECT_PROVIDER:
12312 case DOF_SECT_PROBES:
12313 case DOF_SECT_PRARGS:
12314 case DOF_SECT_PROFFS:
12315 dtrace_dof_error(dof, "illegal sections "
12316 "for enabling");
12317 return (-1);
12318 }
12319 }
12320
12321 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12322 continue; /* just ignore non-loadable sections */
12323
12324 if (sec->dofs_align & (sec->dofs_align - 1)) {
12325 dtrace_dof_error(dof, "bad section alignment");
12326 return (-1);
12327 }
12328
12329 if (sec->dofs_offset & (sec->dofs_align - 1)) {
12330 dtrace_dof_error(dof, "misaligned section");
12331 return (-1);
12332 }
12333
12334 if (sec->dofs_offset > len || sec->dofs_size > len ||
12335 sec->dofs_offset + sec->dofs_size > len) {
12336 dtrace_dof_error(dof, "corrupt section header");
12337 return (-1);
12338 }
12339
12340 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
12341 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
12342 dtrace_dof_error(dof, "non-terminating string table");
12343 return (-1);
12344 }
12345 }
12346
12347 /*
12348 * APPLE NOTE: We have no further relocation to perform.
12349 * All dof values are relative offsets.
12350 */
12351
12352 if ((enab = *enabp) == NULL)
12353 enab = *enabp = dtrace_enabling_create(vstate);
12354
12355 for (i = 0; i < dof->dofh_secnum; i++) {
12356 dof_sec_t *sec = (dof_sec_t *)(daddr +
12357 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12358
12359 if (sec->dofs_type != DOF_SECT_ECBDESC)
12360 continue;
12361
12362 /*
12363 * APPLE NOTE: Defend against gcc 4.0 botch on x86.
12364 * Not all paths out of the inlined dtrace_dof_ecbdesc()
12365 * are checked for the NULL return value.
12366 * Check for NULL explicitly here.
12367 */
12368 ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
12369 if (ep == NULL) {
12370 dtrace_enabling_destroy(enab);
12371 *enabp = NULL;
12372 return (-1);
12373 }
12374
12375 dtrace_enabling_add(enab, ep);
12376 }
12377
12378 return (0);
12379 }
12380
12381 /*
12382 * Process DOF for any options. This routine assumes that the DOF has been
12383 * at least processed by dtrace_dof_slurp().
12384 */
12385 static int
12386 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
12387 {
12388 uint_t i;
12389 int rval;
12390 uint32_t entsize;
12391 size_t offs;
12392 dof_optdesc_t *desc;
12393
12394 for (i = 0; i < dof->dofh_secnum; i++) {
12395 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
12396 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12397
12398 if (sec->dofs_type != DOF_SECT_OPTDESC)
12399 continue;
12400
12401 if (sec->dofs_align != sizeof (uint64_t)) {
12402 dtrace_dof_error(dof, "bad alignment in "
12403 "option description");
12404 return (EINVAL);
12405 }
12406
12407 if ((entsize = sec->dofs_entsize) == 0) {
12408 dtrace_dof_error(dof, "zeroed option entry size");
12409 return (EINVAL);
12410 }
12411
12412 if (entsize < sizeof (dof_optdesc_t)) {
12413 dtrace_dof_error(dof, "bad option entry size");
12414 return (EINVAL);
12415 }
12416
12417 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
12418 desc = (dof_optdesc_t *)((uintptr_t)dof +
12419 (uintptr_t)sec->dofs_offset + offs);
12420
12421 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
12422 dtrace_dof_error(dof, "non-zero option string");
12423 return (EINVAL);
12424 }
12425
12426 if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
12427 dtrace_dof_error(dof, "unset option");
12428 return (EINVAL);
12429 }
12430
12431 if ((rval = dtrace_state_option(state,
12432 desc->dofo_option, desc->dofo_value)) != 0) {
12433 dtrace_dof_error(dof, "rejected option");
12434 return (rval);
12435 }
12436 }
12437 }
12438
12439 return (0);
12440 }
12441
12442 /*
12443 * DTrace Consumer State Functions
12444 */
12445 static int
12446 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
12447 {
12448 size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
12449 void *base;
12450 uintptr_t limit;
12451 dtrace_dynvar_t *dvar, *next, *start;
12452 size_t i;
12453
12454 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12455 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
12456
12457 bzero(dstate, sizeof (dtrace_dstate_t));
12458
12459 if ((dstate->dtds_chunksize = chunksize) == 0)
12460 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
12461
12462 if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
12463 size = min_size;
12464
12465 if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12466 return (ENOMEM);
12467
12468 dstate->dtds_size = size;
12469 dstate->dtds_base = base;
12470 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
12471 bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t));
12472
12473 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
12474
12475 if (hashsize != 1 && (hashsize & 1))
12476 hashsize--;
12477
12478 dstate->dtds_hashsize = hashsize;
12479 dstate->dtds_hash = dstate->dtds_base;
12480
12481 /*
12482 * Set all of our hash buckets to point to the single sink, and (if
12483 * it hasn't already been set), set the sink's hash value to be the
12484 * sink sentinel value. The sink is needed for dynamic variable
12485 * lookups to know that they have iterated over an entire, valid hash
12486 * chain.
12487 */
12488 for (i = 0; i < hashsize; i++)
12489 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
12490
12491 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
12492 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
12493
12494 /*
12495 * Determine number of active CPUs. Divide free list evenly among
12496 * active CPUs.
12497 */
12498 start = (dtrace_dynvar_t *)
12499 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
12500 limit = (uintptr_t)base + size;
12501
12502 maxper = (limit - (uintptr_t)start) / (int)NCPU;
12503 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
12504
12505 for (i = 0; i < NCPU; i++) {
12506 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
12507
12508 /*
12509 * If we don't even have enough chunks to make it once through
12510 * NCPUs, we're just going to allocate everything to the first
12511 * CPU. And if we're on the last CPU, we're going to allocate
12512 * whatever is left over. In either case, we set the limit to
12513 * be the limit of the dynamic variable space.
12514 */
12515 if (maxper == 0 || i == NCPU - 1) {
12516 limit = (uintptr_t)base + size;
12517 start = NULL;
12518 } else {
12519 limit = (uintptr_t)start + maxper;
12520 start = (dtrace_dynvar_t *)limit;
12521 }
12522
12523 ASSERT(limit <= (uintptr_t)base + size);
12524
12525 for (;;) {
12526 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
12527 dstate->dtds_chunksize);
12528
12529 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
12530 break;
12531
12532 dvar->dtdv_next = next;
12533 dvar = next;
12534 }
12535
12536 if (maxper == 0)
12537 break;
12538 }
12539
12540 return (0);
12541 }
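/*
 * Illustrative sketch (editorial, not part of the original source): the
 * layout arithmetic performed by dtrace_dstate_init() above, worked with
 * small hypothetical numbers.  The 16-byte dtrace_dynhash_t, the 256-byte
 * chunk and the 4-CPU count are assumptions made purely for the example;
 * the real values come from the consumer's options and from NCPU.
 */
#if 0
static void
dtrace_dstate_layout_example(void)
{
size_t size = 65536; /* hypothetical dynamic variable space */
size_t chunksize = 256; /* hypothetical chunk size */
size_t hashent = 16; /* assumed sizeof (dtrace_dynhash_t) */
size_t ncpu = 4; /* hypothetical CPU count */

size_t hashsize = size / (chunksize + hashent); /* 240 buckets */
size_t dynspace = size - hashsize * hashent; /* 61696 bytes of chunks */
size_t maxper = ((dynspace / ncpu) / chunksize) * chunksize; /* 15360 */

/*
 * The hash table sits at the front of the region; each CPU's free list
 * is then carved from the remainder in maxper-byte slices of whole
 * chunks, with the last CPU absorbing whatever is left over -- the same
 * walk the loop in dtrace_dstate_init() performs.
 */
(void) maxper;
}
#endif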
12542
12543 static void
12544 dtrace_dstate_fini(dtrace_dstate_t *dstate)
12545 {
12546 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12547
12548 if (dstate->dtds_base == NULL)
12549 return;
12550
12551 kmem_free(dstate->dtds_base, dstate->dtds_size);
12552 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
12553 }
12554
12555 static void
12556 dtrace_vstate_fini(dtrace_vstate_t *vstate)
12557 {
12558 /*
12559 * Logical XOR, where are you?
12560 */
12561 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
12562
12563 if (vstate->dtvs_nglobals > 0) {
12564 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
12565 sizeof (dtrace_statvar_t *));
12566 }
12567
12568 if (vstate->dtvs_ntlocals > 0) {
12569 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
12570 sizeof (dtrace_difv_t));
12571 }
12572
12573 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
12574
12575 if (vstate->dtvs_nlocals > 0) {
12576 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
12577 sizeof (dtrace_statvar_t *));
12578 }
12579 }
12580
12581 static void
12582 dtrace_state_clean(dtrace_state_t *state)
12583 {
12584 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
12585 return;
12586
12587 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
12588 dtrace_speculation_clean(state);
12589 }
12590
12591 static void
12592 dtrace_state_deadman(dtrace_state_t *state)
12593 {
12594 hrtime_t now;
12595
12596 dtrace_sync();
12597
12598 now = dtrace_gethrtime();
12599
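/*
 * If a (non-anonymous) consumer has not reported status within
 * dtrace_deadman_user, skip the dts_alive refresh below; the stale
 * timestamp lets the deadman machinery elsewhere eventually conclude
 * that the consumer is gone.
 */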
12600 if (state != dtrace_anon.dta_state &&
12601 now - state->dts_laststatus >= dtrace_deadman_user)
12602 return;
12603
12604 /*
12605 * We must be sure that dts_alive never appears to be less than the
12606 * value upon entry to dtrace_state_deadman(), and because we lack a
12607 * dtrace_cas64(), we cannot store to it atomically. We thus instead
12608 * store INT64_MAX to it, followed by a memory barrier, followed by
12609 * the new value. This assures that dts_alive never appears to be
12610 * less than its true value, regardless of the order in which the
12611 * stores to the underlying storage are issued.
12612 */
12613 state->dts_alive = INT64_MAX;
12614 dtrace_membar_producer();
12615 state->dts_alive = now;
12616 }
12617
12618 static int
12619 dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
12620 {
12621 minor_t minor;
12622 major_t major;
12623 char c[30];
12624 dtrace_state_t *state;
12625 dtrace_optval_t *opt;
12626 int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
12627
12628 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12629 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12630
12631 /* Cause restart */
12632 *new_state = NULL;
12633
12634 /*
12635 * Darwin's DEVFS layer acquired the minor number for this "device" when it called
12636 * dtrace_devfs_clone_func(). At that time, dtrace_devfs_clone_func() proposed a minor number
12637 * (next unused according to vmem_alloc()) and then immediately put the number back in play
12638 * (by calling vmem_free()). Now that minor number is being used for an open, so we commit it
12639 * to use. The following vmem_alloc() must deliver that same minor number. FIXME.
12640 */
12641
12642 minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
12643 VM_BESTFIT | VM_SLEEP);
12644
12645 if (NULL != devp) {
12646 ASSERT(getminor(*devp) == minor);
12647 if (getminor(*devp) != minor) {
12648 printf("dtrace_open: couldn't re-acquire vended minor number %d. Instead got %d\n",
12649 getminor(*devp), minor);
12650 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12651 return (ERESTART); /* can't reacquire */
12652 }
12653 } else {
12654 /* NULL==devp iff "Anonymous state" (see dtrace_anon_property),
12655 * so just vend the minor device number here de novo since no "open" has occurred. */
12656 }
12657
12658 if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
12659 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12660 return (EAGAIN); /* temporary resource shortage */
12661 }
12662
12663 state = ddi_get_soft_state(dtrace_softstate, minor);
12664 state->dts_epid = DTRACE_EPIDNONE + 1;
12665
12666 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
12667 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
12668 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
12669
12670 if (devp != NULL) {
12671 major = getemajor(*devp);
12672 } else {
12673 major = ddi_driver_major(dtrace_devi);
12674 }
12675
12676 state->dts_dev = makedevice(major, minor);
12677
12678 if (devp != NULL)
12679 *devp = state->dts_dev;
12680
12681 /*
12682 * We allocate NCPU buffers. On the one hand, this can be quite
12683 * a bit of memory per instance (nearly 36K on a Starcat). On the
12684 * other hand, it saves an additional memory reference in the probe
12685 * path.
12686 */
12687 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
12688 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
12689 state->dts_cleaner = CYCLIC_NONE;
12690 state->dts_deadman = CYCLIC_NONE;
12691 state->dts_vstate.dtvs_state = state;
12692
12693 for (i = 0; i < DTRACEOPT_MAX; i++)
12694 state->dts_options[i] = DTRACEOPT_UNSET;
12695
12696 /*
12697 * Set the default options.
12698 */
12699 opt = state->dts_options;
12700 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
12701 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
12702 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
12703 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
12704 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
12705 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
12706 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
12707 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
12708 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
12709 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
12710 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
12711 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
12712 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
12713 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
12714
12715 state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
12716
12717 /*
12718 * Depending on the user credentials, we set flag bits which alter probe
12719 * visibility or the amount of destructiveness allowed. In the case of
12720 * actual anonymous tracing, or the possession of all privileges, all of
12721 * the normal checks are bypassed.
12722 */
12723 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
12724 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
12725 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
12726 } else {
12727 /*
12728 * Set up the credentials for this instantiation. We take a
12729 * hold on the credential to prevent it from disappearing on
12730 * us; this in turn prevents the zone_t referenced by this
12731 * credential from disappearing. This means that we can
12732 * examine the credential and the zone from probe context.
12733 */
12734 crhold(cr);
12735 state->dts_cred.dcr_cred = cr;
12736
12737 /*
12738 * CRA_PROC means "we have *some* privilege for dtrace" and
12739 * unlocks the use of variables like pid, zonename, etc.
12740 */
12741 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
12742 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12743 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
12744 }
12745
12746 /*
12747 * dtrace_user allows use of syscall and profile providers.
12748 * If the user also has proc_owner and/or proc_zone, we
12749 * extend the scope to include additional visibility and
12750 * destructive power.
12751 */
12752 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
12753 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
12754 state->dts_cred.dcr_visible |=
12755 DTRACE_CRV_ALLPROC;
12756
12757 state->dts_cred.dcr_action |=
12758 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12759 }
12760
12761 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
12762 state->dts_cred.dcr_visible |=
12763 DTRACE_CRV_ALLZONE;
12764
12765 state->dts_cred.dcr_action |=
12766 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12767 }
12768
12769 /*
12770 * If we have all privs in whatever zone this is,
12771 * we can do destructive things to processes which
12772 * have altered credentials.
12773 *
12774 * APPLE NOTE: Darwin doesn't do zones.
12775 * Behave as if zone always has destructive privs.
12776 */
12777
12778 state->dts_cred.dcr_action |=
12779 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12780 }
12781
12782 /*
12783 * Holding the dtrace_kernel privilege also implies that
12784 * the user has the dtrace_user privilege from a visibility
12785 * perspective. But without further privileges, some
12786 * destructive actions are not available.
12787 */
12788 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
12789 /*
12790 * Make all probes in all zones visible. However,
12791 * this doesn't mean that all actions become available
12792 * to all zones.
12793 */
12794 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
12795 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
12796
12797 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
12798 DTRACE_CRA_PROC;
12799 /*
12800 * Holding proc_owner means that destructive actions
12801 * for *this* zone are allowed.
12802 */
12803 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12804 state->dts_cred.dcr_action |=
12805 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12806
12807 /*
12808 * Holding proc_zone means that destructive actions
12809 * for this user/group ID in all zones are allowed.
12810 */
12811 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12812 state->dts_cred.dcr_action |=
12813 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12814
12815 /*
12816 * If we have all privs in whatever zone this is,
12817 * we can do destructive things to processes which
12818 * have altered credentials.
12819 *
12820 * APPLE NOTE: Darwin doesn't do zones.
12821 * Behave as if zone always has destructive privs.
12822 */
12823 state->dts_cred.dcr_action |=
12824 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12825 }
12826
12827 /*
12828 * Holding the dtrace_proc privilege gives control over fasttrap
12829 * and pid providers. We need to grant wider destructive
12830 * privileges in the event that the user has proc_owner and/or
12831 * proc_zone.
12832 */
12833 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12834 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12835 state->dts_cred.dcr_action |=
12836 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12837
12838 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12839 state->dts_cred.dcr_action |=
12840 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12841 }
12842 }
12843
12844 *new_state = state;
12845 return(0); /* Success */
12846 }
12847
12848 static int
12849 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
12850 {
12851 dtrace_optval_t *opt = state->dts_options, size;
12852 processorid_t cpu = 0;
12853 int flags = 0, rval;
12854
12855 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12856 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12857 ASSERT(which < DTRACEOPT_MAX);
12858 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
12859 (state == dtrace_anon.dta_state &&
12860 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
12861
12862 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
12863 return (0);
12864
12865 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
12866 cpu = opt[DTRACEOPT_CPU];
12867
12868 if (which == DTRACEOPT_SPECSIZE)
12869 flags |= DTRACEBUF_NOSWITCH;
12870
12871 if (which == DTRACEOPT_BUFSIZE) {
12872 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
12873 flags |= DTRACEBUF_RING;
12874
12875 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
12876 flags |= DTRACEBUF_FILL;
12877
12878 if (state != dtrace_anon.dta_state ||
12879 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
12880 flags |= DTRACEBUF_INACTIVE;
12881 }
12882
12883 for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
12884 /*
12885 * The size must be 8-byte aligned. If the size is not 8-byte
12886 * aligned, drop it down by the difference.
12887 */
12888 if (size & (sizeof (uint64_t) - 1))
12889 size -= size & (sizeof (uint64_t) - 1);
12890
12891 if (size < state->dts_reserve) {
12892 /*
12893 * Buffers must always be large enough to accommodate
12894 * their prereserved space. We return E2BIG instead
12895 * of ENOMEM in this case to allow user-level
12896 * software to differentiate the cases.
12897 */
12898 return (E2BIG);
12899 }
12900
12901 rval = dtrace_buffer_alloc(buf, size, flags, cpu);
12902
12903 if (rval != ENOMEM) {
12904 opt[which] = size;
12905 return (rval);
12906 }
12907
12908 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12909 return (rval);
12910 }
12911
12912 return (ENOMEM);
12913 }
12914
12915 static int
12916 dtrace_state_buffers(dtrace_state_t *state)
12917 {
12918 dtrace_speculation_t *spec = state->dts_speculations;
12919 int rval, i;
12920
12921 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
12922 DTRACEOPT_BUFSIZE)) != 0)
12923 return (rval);
12924
12925 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
12926 DTRACEOPT_AGGSIZE)) != 0)
12927 return (rval);
12928
12929 for (i = 0; i < state->dts_nspeculations; i++) {
12930 if ((rval = dtrace_state_buffer(state,
12931 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
12932 return (rval);
12933 }
12934
12935 return (0);
12936 }
12937
12938 static void
12939 dtrace_state_prereserve(dtrace_state_t *state)
12940 {
12941 dtrace_ecb_t *ecb;
12942 dtrace_probe_t *probe;
12943
12944 state->dts_reserve = 0;
12945
12946 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
12947 return;
12948
12949 /*
12950 * If our buffer policy is a "fill" buffer policy, we need to set the
12951 * prereserved space to be the space required by the END probes.
12952 */
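/*
 * Probe identifiers are 1-based while the dtrace_probes array is
 * 0-indexed, hence the "- 1" when looking up the END probe below.
 */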
12953 probe = dtrace_probes[dtrace_probeid_end - 1];
12954 ASSERT(probe != NULL);
12955
12956 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
12957 if (ecb->dte_state != state)
12958 continue;
12959
12960 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
12961 }
12962 }
12963
12964 static int
12965 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
12966 {
12967 dtrace_optval_t *opt = state->dts_options, sz, nspec;
12968 dtrace_speculation_t *spec;
12969 dtrace_buffer_t *buf;
12970 cyc_handler_t hdlr;
12971 cyc_time_t when;
12972 int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
12973 dtrace_icookie_t cookie;
12974
12975 lck_mtx_lock(&cpu_lock);
12976 lck_mtx_lock(&dtrace_lock);
12977
12978 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
12979 rval = EBUSY;
12980 goto out;
12981 }
12982
12983 /*
12984 * Before we can perform any checks, we must prime all of the
12985 * retained enablings that correspond to this state.
12986 */
12987 dtrace_enabling_prime(state);
12988
12989 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
12990 rval = EACCES;
12991 goto out;
12992 }
12993
12994 dtrace_state_prereserve(state);
12995
12996 /*
12997 * Now what we want to do is try to allocate our speculations.
12998 * We do not automatically resize the number of speculations; if
12999 * this fails, we will fail the operation.
13000 */
13001 nspec = opt[DTRACEOPT_NSPEC];
13002 ASSERT(nspec != DTRACEOPT_UNSET);
13003
13004 if (nspec > INT_MAX) {
13005 rval = ENOMEM;
13006 goto out;
13007 }
13008
13009 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
13010
13011 if (spec == NULL) {
13012 rval = ENOMEM;
13013 goto out;
13014 }
13015
13016 state->dts_speculations = spec;
13017 state->dts_nspeculations = (int)nspec;
13018
13019 for (i = 0; i < nspec; i++) {
13020 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
13021 rval = ENOMEM;
13022 goto err;
13023 }
13024
13025 spec[i].dtsp_buffer = buf;
13026 }
13027
13028 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
13029 if (dtrace_anon.dta_state == NULL) {
13030 rval = ENOENT;
13031 goto out;
13032 }
13033
13034 if (state->dts_necbs != 0) {
13035 rval = EALREADY;
13036 goto out;
13037 }
13038
13039 state->dts_anon = dtrace_anon_grab();
13040 ASSERT(state->dts_anon != NULL);
13041 state = state->dts_anon;
13042
13043 /*
13044 * We want "grabanon" to be set in the grabbed state, so we'll
13045 * copy that option value from the grabbing state into the
13046 * grabbed state.
13047 */
13048 state->dts_options[DTRACEOPT_GRABANON] =
13049 opt[DTRACEOPT_GRABANON];
13050
13051 *cpu = dtrace_anon.dta_beganon;
13052
13053 /*
13054 * If the anonymous state is active (as it almost certainly
13055 * is if the anonymous enabling ultimately matched anything),
13056 * we don't allow any further option processing -- but we
13057 * don't return failure.
13058 */
13059 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13060 goto out;
13061 }
13062
13063 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
13064 opt[DTRACEOPT_AGGSIZE] != 0) {
13065 if (state->dts_aggregations == NULL) {
13066 /*
13067 * We're not going to create an aggregation buffer
13068 * because we don't have any ECBs that contain
13069 * aggregations -- set this option to 0.
13070 */
13071 opt[DTRACEOPT_AGGSIZE] = 0;
13072 } else {
13073 /*
13074 * If we have an aggregation buffer, we must also have
13075 * a buffer to use as scratch.
13076 */
13077 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
13078 (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
13079 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
13080 }
13081 }
13082 }
13083
13084 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
13085 opt[DTRACEOPT_SPECSIZE] != 0) {
13086 if (!state->dts_speculates) {
13087 /*
13088 * We're not going to create speculation buffers
13089 * because we don't have any ECBs that actually
13090 * speculate -- set the speculation size to 0.
13091 */
13092 opt[DTRACEOPT_SPECSIZE] = 0;
13093 }
13094 }
13095
13096 /*
13097 * The bare minimum size for any buffer that we're actually going to
13098 * do anything to is sizeof (uint64_t).
13099 */
13100 sz = sizeof (uint64_t);
13101
13102 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
13103 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
13104 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
13105 /*
13106 * A buffer size has been explicitly set to 0 (or to a size
13107 * that will be adjusted to 0) and we need the space -- we
13108 * need to return failure. We return ENOSPC to differentiate
13109 * it from failing to allocate a buffer due to failure to meet
13110 * the reserve (for which we return E2BIG).
13111 */
13112 rval = ENOSPC;
13113 goto out;
13114 }
13115
13116 if ((rval = dtrace_state_buffers(state)) != 0)
13117 goto err;
13118
13119 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
13120 sz = dtrace_dstate_defsize;
13121
13122 do {
13123 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
13124
13125 if (rval == 0)
13126 break;
13127
13128 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13129 goto err;
13130 } while (sz >>= 1);
13131
13132 opt[DTRACEOPT_DYNVARSIZE] = sz;
13133
13134 if (rval != 0)
13135 goto err;
13136
13137 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
13138 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
13139
13140 if (opt[DTRACEOPT_CLEANRATE] == 0)
13141 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13142
13143 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
13144 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
13145
13146 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
13147 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13148
13149 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
13150 hdlr.cyh_arg = state;
13151 hdlr.cyh_level = CY_LOW_LEVEL;
13152
13153 when.cyt_when = 0;
13154 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
13155
13156 state->dts_cleaner = cyclic_add(&hdlr, &when);
13157
13158 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
13159 hdlr.cyh_arg = state;
13160 hdlr.cyh_level = CY_LOW_LEVEL;
13161
13162 when.cyt_when = 0;
13163 when.cyt_interval = dtrace_deadman_interval;
13164
13165 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
13166 state->dts_deadman = cyclic_add(&hdlr, &when);
13167
13168 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
13169
13170 /*
13171 * Now it's time to actually fire the BEGIN probe. We need to disable
13172 * interrupts here both to record the CPU on which we fired the BEGIN
13173 * probe (the data from this CPU will be processed first at user
13174 * level) and to manually activate the buffer for this CPU.
13175 */
13176 cookie = dtrace_interrupt_disable();
13177 *cpu = CPU->cpu_id;
13178 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
13179 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
13180
13181 dtrace_probe(dtrace_probeid_begin,
13182 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13183 dtrace_interrupt_enable(cookie);
13184 /*
13185 * We may have had an exit action from a BEGIN probe; only change our
13186 * state to ACTIVE if we're still in WARMUP.
13187 */
13188 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
13189 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
13190
13191 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
13192 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
13193
13194 /*
13195 * Regardless of whether we're now in ACTIVE or DRAINING, we
13196 * want each CPU to transition its principal buffer out of the
13197 * INACTIVE state. Doing this assures that no CPU will suddenly begin
13198 * processing an ECB halfway down a probe's ECB chain; all CPUs will
13199 * atomically transition from processing none of a state's ECBs to
13200 * processing all of them.
13201 */
13202 dtrace_xcall(DTRACE_CPUALL,
13203 (dtrace_xcall_t)dtrace_buffer_activate, state);
13204 goto out;
13205
13206 err:
13207 dtrace_buffer_free(state->dts_buffer);
13208 dtrace_buffer_free(state->dts_aggbuffer);
13209
13210 if ((nspec = state->dts_nspeculations) == 0) {
13211 ASSERT(state->dts_speculations == NULL);
13212 goto out;
13213 }
13214
13215 spec = state->dts_speculations;
13216 ASSERT(spec != NULL);
13217
13218 for (i = 0; i < state->dts_nspeculations; i++) {
13219 if ((buf = spec[i].dtsp_buffer) == NULL)
13220 break;
13221
13222 dtrace_buffer_free(buf);
13223 kmem_free(buf, bufsize);
13224 }
13225
13226 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13227 state->dts_nspeculations = 0;
13228 state->dts_speculations = NULL;
13229
13230 out:
13231 lck_mtx_unlock(&dtrace_lock);
13232 lck_mtx_unlock(&cpu_lock);
13233
13234 return (rval);
13235 }
13236
13237 static int
13238 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
13239 {
13240 dtrace_icookie_t cookie;
13241
13242 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13243
13244 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
13245 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
13246 return (EINVAL);
13247
13248 /*
13249 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
13250 * to be sure that every CPU has seen it. See below for the details
13251 * on why this is done.
13252 */
13253 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
13254 dtrace_sync();
13255
13256 /*
13257 * By this point, it is impossible for any CPU to be still processing
13258 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
13259 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
13260 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
13261 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
13262 * iff we're in the END probe.
13263 */
13264 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
13265 dtrace_sync();
13266 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
13267
13268 /*
13269 * Finally, we can release the reserve and call the END probe. We
13270 * disable interrupts across calling the END probe to allow us to
13271 * return the CPU on which we actually called the END probe. This
13272 * allows user-land to be sure that this CPU's principal buffer is
13273 * processed last.
13274 */
13275 state->dts_reserve = 0;
13276
13277 cookie = dtrace_interrupt_disable();
13278 *cpu = CPU->cpu_id;
13279 dtrace_probe(dtrace_probeid_end,
13280 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13281 dtrace_interrupt_enable(cookie);
13282
13283 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
13284 dtrace_sync();
13285
13286 return (0);
13287 }
13288
13289 static int
13290 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
13291 dtrace_optval_t val)
13292 {
13293 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13294
13295 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13296 return (EBUSY);
13297
13298 if (option >= DTRACEOPT_MAX)
13299 return (EINVAL);
13300
13301 if (option != DTRACEOPT_CPU && val < 0)
13302 return (EINVAL);
13303
13304 switch (option) {
13305 case DTRACEOPT_DESTRUCTIVE:
13306 /*
13307 * Prevent consumers from enabling destructive actions if DTrace
13308 * is running in a restricted environment, or if actions are
13309 * disallowed.
13310 */
13311 if (dtrace_is_restricted() || dtrace_destructive_disallow)
13312 return (EACCES);
13313
13314 state->dts_cred.dcr_destructive = 1;
13315 break;
13316
13317 case DTRACEOPT_BUFSIZE:
13318 case DTRACEOPT_DYNVARSIZE:
13319 case DTRACEOPT_AGGSIZE:
13320 case DTRACEOPT_SPECSIZE:
13321 case DTRACEOPT_STRSIZE:
13322 if (val < 0)
13323 return (EINVAL);
13324
13325 if (val >= LONG_MAX) {
13326 /*
13327 * If this is an otherwise negative value, set it to
13328 * the highest multiple of 128m less than LONG_MAX.
13329 * Technically, we're adjusting the size without
13330 * regard to the buffer resizing policy, but in fact,
13331 * this has no effect -- if we set the buffer size to
13332 * ~LONG_MAX and the buffer policy is ultimately set to
13333 * be "manual", the buffer allocation is guaranteed to
13334 * fail, if only because the allocation requires two
13335 * buffers. (We set the size to the highest
13336 * multiple of 128m because it ensures that the size
13337 * will remain a multiple of a megabyte when
13338 * repeatedly halved -- all the way down to 15m.)
13339 */
13340 val = LONG_MAX - (1 << 27) + 1;
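/*
 * Worked example (editorial): on an LP64 kernel LONG_MAX is 2^63 - 1,
 * so the assignment above yields 2^63 - 2^27 (0x7FFFFFFFF8000000),
 * the highest multiple of 128m below LONG_MAX.
 */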
13341 }
13342 }
13343
13344 state->dts_options[option] = val;
13345
13346 return (0);
13347 }
13348
13349 static void
13350 dtrace_state_destroy(dtrace_state_t *state)
13351 {
13352 dtrace_ecb_t *ecb;
13353 dtrace_vstate_t *vstate = &state->dts_vstate;
13354 minor_t minor = getminor(state->dts_dev);
13355 int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
13356 dtrace_speculation_t *spec = state->dts_speculations;
13357 int nspec = state->dts_nspeculations;
13358 uint32_t match;
13359
13360 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13361 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13362
13363 /*
13364 * First, retract any retained enablings for this state.
13365 */
13366 dtrace_enabling_retract(state);
13367 ASSERT(state->dts_nretained == 0);
13368
13369 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
13370 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
13371 /*
13372 * We have managed to come into dtrace_state_destroy() on a
13373 * hot enabling -- almost certainly because of a disorderly
13374 * shutdown of a consumer. (That is, a consumer that is
13375 * exiting without having called dtrace_stop().) In this case,
13376 * we're going to set our activity to be KILLED, and then
13377 * issue a sync to be sure that everyone is out of probe
13378 * context before we start blowing away ECBs.
13379 */
13380 state->dts_activity = DTRACE_ACTIVITY_KILLED;
13381 dtrace_sync();
13382 }
13383
13384 /*
13385 * Release the credential hold we took in dtrace_state_create().
13386 */
13387 if (state->dts_cred.dcr_cred != NULL)
13388 crfree(state->dts_cred.dcr_cred);
13389
13390 /*
13391 * Now we can safely disable and destroy any enabled probes. Because
13392 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
13393 * (especially if they're all enabled), we take two passes through the
13394 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
13395 * in the second we disable whatever is left over.
13396 */
13397 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
13398 for (i = 0; i < state->dts_necbs; i++) {
13399 if ((ecb = state->dts_ecbs[i]) == NULL)
13400 continue;
13401
13402 if (match && ecb->dte_probe != NULL) {
13403 dtrace_probe_t *probe = ecb->dte_probe;
13404 dtrace_provider_t *prov = probe->dtpr_provider;
13405
13406 if (!(prov->dtpv_priv.dtpp_flags & match))
13407 continue;
13408 }
13409
13410 dtrace_ecb_disable(ecb);
13411 dtrace_ecb_destroy(ecb);
13412 }
13413
13414 if (!match)
13415 break;
13416 }
13417
13418 /*
13419 * Before we free the buffers, perform one more sync to assure that
13420 * every CPU is out of probe context.
13421 */
13422 dtrace_sync();
13423
13424 dtrace_buffer_free(state->dts_buffer);
13425 dtrace_buffer_free(state->dts_aggbuffer);
13426
13427 for (i = 0; i < nspec; i++)
13428 dtrace_buffer_free(spec[i].dtsp_buffer);
13429
13430 if (state->dts_cleaner != CYCLIC_NONE)
13431 cyclic_remove(state->dts_cleaner);
13432
13433 if (state->dts_deadman != CYCLIC_NONE)
13434 cyclic_remove(state->dts_deadman);
13435
13436 dtrace_dstate_fini(&vstate->dtvs_dynvars);
13437 dtrace_vstate_fini(vstate);
13438 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
13439
13440 if (state->dts_aggregations != NULL) {
13441 #if DEBUG
13442 for (i = 0; i < state->dts_naggregations; i++)
13443 ASSERT(state->dts_aggregations[i] == NULL);
13444 #endif
13445 ASSERT(state->dts_naggregations > 0);
13446 kmem_free(state->dts_aggregations,
13447 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
13448 }
13449
13450 kmem_free(state->dts_buffer, bufsize);
13451 kmem_free(state->dts_aggbuffer, bufsize);
13452
13453 for (i = 0; i < nspec; i++)
13454 kmem_free(spec[i].dtsp_buffer, bufsize);
13455
13456 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13457
13458 dtrace_format_destroy(state);
13459
13460 vmem_destroy(state->dts_aggid_arena);
13461 ddi_soft_state_free(dtrace_softstate, minor);
13462 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13463 }
13464
13465 /*
13466 * DTrace Anonymous Enabling Functions
13467 */
13468 static dtrace_state_t *
13469 dtrace_anon_grab(void)
13470 {
13471 dtrace_state_t *state;
13472
13473 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13474
13475 if ((state = dtrace_anon.dta_state) == NULL) {
13476 ASSERT(dtrace_anon.dta_enabling == NULL);
13477 return (NULL);
13478 }
13479
13480 ASSERT(dtrace_anon.dta_enabling != NULL);
13481 ASSERT(dtrace_retained != NULL);
13482
13483 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
13484 dtrace_anon.dta_enabling = NULL;
13485 dtrace_anon.dta_state = NULL;
13486
13487 return (state);
13488 }
13489
13490 static void
13491 dtrace_anon_property(void)
13492 {
13493 int i, rv;
13494 dtrace_state_t *state;
13495 dof_hdr_t *dof;
13496 char c[32]; /* enough for "dof-data-" + digits */
13497
13498 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13499 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13500
13501 for (i = 0; ; i++) {
13502 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
13503
13504 dtrace_err_verbose = 1;
13505
13506 if ((dof = dtrace_dof_property(c)) == NULL) {
13507 dtrace_err_verbose = 0;
13508 break;
13509 }
13510
13511 /*
13512 * We want to create anonymous state, so we need to transition
13513 * the kernel debugger to indicate that DTrace is active. If
13514 * this fails (e.g. because the debugger has modified text in
13515 * some way), we won't continue with the processing.
13516 */
13517 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
13518 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
13519 "enabling ignored.");
13520 dtrace_dof_destroy(dof);
13521 break;
13522 }
13523
13524 /*
13525 * If we haven't allocated an anonymous state, we'll do so now.
13526 */
13527 if ((state = dtrace_anon.dta_state) == NULL) {
13528 rv = dtrace_state_create(NULL, NULL, &state);
13529 dtrace_anon.dta_state = state;
13530 if (rv != 0 || state == NULL) {
13531 /*
13532 * This basically shouldn't happen: the only
13533 * failure mode from dtrace_state_create() is a
13534 * failure of ddi_soft_state_zalloc() that
13535 * itself should never happen. Still, the
13536 * interface allows for a failure mode, and
13537 * we want to fail as gracefully as possible:
13538 * we'll emit an error message and cease
13539 * processing anonymous state in this case.
13540 */
13541 cmn_err(CE_WARN, "failed to create "
13542 "anonymous state");
13543 dtrace_dof_destroy(dof);
13544 break;
13545 }
13546 }
13547
13548 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
13549 &dtrace_anon.dta_enabling, 0, B_TRUE);
13550
13551 if (rv == 0)
13552 rv = dtrace_dof_options(dof, state);
13553
13554 dtrace_err_verbose = 0;
13555 dtrace_dof_destroy(dof);
13556
13557 if (rv != 0) {
13558 /*
13559 * This is malformed DOF; chuck any anonymous state
13560 * that we created.
13561 */
13562 ASSERT(dtrace_anon.dta_enabling == NULL);
13563 dtrace_state_destroy(state);
13564 dtrace_anon.dta_state = NULL;
13565 break;
13566 }
13567
13568 ASSERT(dtrace_anon.dta_enabling != NULL);
13569 }
13570
13571 if (dtrace_anon.dta_enabling != NULL) {
13572 int rval;
13573
13574 /*
13575 * dtrace_enabling_retain() can only fail because we are
13576 * trying to retain more enablings than are allowed -- but
13577 * we only have one anonymous enabling, and we are guaranteed
13578 * to be allowed at least one retained enabling; we assert
13579 * that dtrace_enabling_retain() returns success.
13580 */
13581 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
13582 ASSERT(rval == 0);
13583
13584 dtrace_enabling_dump(dtrace_anon.dta_enabling);
13585 }
13586 }
13587
13588 /*
13589 * DTrace Helper Functions
13590 */
13591 static void
13592 dtrace_helper_trace(dtrace_helper_action_t *helper,
13593 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
13594 {
13595 uint32_t size, next, nnext;
13596 int i;
13597 dtrace_helptrace_t *ent;
13598 uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
13599
13600 if (!dtrace_helptrace_enabled)
13601 return;
13602
13603 ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
13604
13605 /*
13606 * What would a tracing framework be without its own tracing
13607 * framework? (Well, a hell of a lot simpler, for starters...)
13608 */
13609 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
13610 sizeof (uint64_t) - sizeof (uint64_t);
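	/*
	 * (The subtraction of one uint64_t above accounts for the single
	 * local slot already included in the dtrace_helptrace_t structure's
	 * dtht_locals array, which is filled in below.)
	 */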
13611
13612 /*
13613 * Iterate until we can allocate a slot in the trace buffer.
13614 */
13615 do {
13616 next = dtrace_helptrace_next;
13617
13618 if (next + size < dtrace_helptrace_bufsize) {
13619 nnext = next + size;
13620 } else {
13621 nnext = size;
13622 }
13623 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
13624
13625 /*
13626 * We have our slot; fill it in.
13627 */
13628 if (nnext == size)
13629 next = 0;
13630
13631 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
13632 ent->dtht_helper = helper;
13633 ent->dtht_where = where;
13634 ent->dtht_nlocals = vstate->dtvs_nlocals;
13635
13636 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
13637 mstate->dtms_fltoffs : -1;
13638 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
13639 ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
13640
13641 for (i = 0; i < vstate->dtvs_nlocals; i++) {
13642 dtrace_statvar_t *svar;
13643
13644 if ((svar = vstate->dtvs_locals[i]) == NULL)
13645 continue;
13646
13647 ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
13648 ent->dtht_locals[i] =
13649 ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
13650 }
13651 }
13652
13653 static uint64_t
13654 dtrace_helper(int which, dtrace_mstate_t *mstate,
13655 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
13656 {
13657 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
13658 uint64_t sarg0 = mstate->dtms_arg[0];
13659 uint64_t sarg1 = mstate->dtms_arg[1];
13660 uint64_t rval = 0;
13661 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
13662 dtrace_helper_action_t *helper;
13663 dtrace_vstate_t *vstate;
13664 dtrace_difo_t *pred;
13665 int i, trace = dtrace_helptrace_enabled;
13666
13667 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
13668
13669 if (helpers == NULL)
13670 return (0);
13671
13672 if ((helper = helpers->dthps_actions[which]) == NULL)
13673 return (0);
13674
13675 vstate = &helpers->dthps_vstate;
13676 mstate->dtms_arg[0] = arg0;
13677 mstate->dtms_arg[1] = arg1;
13678
13679 /*
13680 * Now iterate over each helper. If its predicate evaluates to 'true',
13681 * we'll call the corresponding actions. Note that the below calls
13682 * to dtrace_dif_emulate() may set faults in machine state. This is
13683 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
13684 * the stored DIF offset with its own (which is the desired behavior).
13685 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
13686 * from machine state; this is okay, too.
13687 */
13688 for (; helper != NULL; helper = helper->dtha_next) {
13689 if ((pred = helper->dtha_predicate) != NULL) {
13690 if (trace)
13691 dtrace_helper_trace(helper, mstate, vstate, 0);
13692
13693 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
13694 goto next;
13695
13696 if (*flags & CPU_DTRACE_FAULT)
13697 goto err;
13698 }
13699
13700 for (i = 0; i < helper->dtha_nactions; i++) {
13701 if (trace)
13702 dtrace_helper_trace(helper,
13703 mstate, vstate, i + 1);
13704
13705 rval = dtrace_dif_emulate(helper->dtha_actions[i],
13706 mstate, vstate, state);
13707
13708 if (*flags & CPU_DTRACE_FAULT)
13709 goto err;
13710 }
13711
13712 next:
13713 if (trace)
13714 dtrace_helper_trace(helper, mstate, vstate,
13715 DTRACE_HELPTRACE_NEXT);
13716 }
13717
13718 if (trace)
13719 dtrace_helper_trace(helper, mstate, vstate,
13720 DTRACE_HELPTRACE_DONE);
13721
13722 /*
13723 * Restore the arg0 that we saved upon entry.
13724 */
13725 mstate->dtms_arg[0] = sarg0;
13726 mstate->dtms_arg[1] = sarg1;
13727
13728 return (rval);
13729
13730 err:
13731 if (trace)
13732 dtrace_helper_trace(helper, mstate, vstate,
13733 DTRACE_HELPTRACE_ERR);
13734
13735 /*
13736 * Restore the arg0 that we saved upon entry.
13737 */
13738 mstate->dtms_arg[0] = sarg0;
13739 mstate->dtms_arg[1] = sarg1;
13740
13741 return (0);
13742 }
13743
13744 static void
13745 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
13746 dtrace_vstate_t *vstate)
13747 {
13748 int i;
13749
13750 if (helper->dtha_predicate != NULL)
13751 dtrace_difo_release(helper->dtha_predicate, vstate);
13752
13753 for (i = 0; i < helper->dtha_nactions; i++) {
13754 ASSERT(helper->dtha_actions[i] != NULL);
13755 dtrace_difo_release(helper->dtha_actions[i], vstate);
13756 }
13757
13758 kmem_free(helper->dtha_actions,
13759 helper->dtha_nactions * sizeof (dtrace_difo_t *));
13760 kmem_free(helper, sizeof (dtrace_helper_action_t));
13761 }
13762
13763 static int
13764 dtrace_helper_destroygen(proc_t* p, int gen)
13765 {
13766 dtrace_helpers_t *help = p->p_dtrace_helpers;
13767 dtrace_vstate_t *vstate;
13768 uint_t i;
13769
13770 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13771
13772 if (help == NULL || gen > help->dthps_generation)
13773 return (EINVAL);
13774
13775 vstate = &help->dthps_vstate;
13776
13777 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13778 dtrace_helper_action_t *last = NULL, *h, *next;
13779
13780 for (h = help->dthps_actions[i]; h != NULL; h = next) {
13781 next = h->dtha_next;
13782
13783 if (h->dtha_generation == gen) {
13784 if (last != NULL) {
13785 last->dtha_next = next;
13786 } else {
13787 help->dthps_actions[i] = next;
13788 }
13789
13790 dtrace_helper_action_destroy(h, vstate);
13791 } else {
13792 last = h;
13793 }
13794 }
13795 }
13796
13797 /*
13798 * Iterate until we've cleared out all helper providers with the
13799 * given generation number.
13800 */
13801 for (;;) {
13802 dtrace_helper_provider_t *prov = NULL;
13803
13804 /*
13805 * Look for a helper provider with the right generation. We
13806 * have to start back at the beginning of the list each time
13807 * because we drop dtrace_lock. It's unlikely that we'll make
13808 * more than two passes.
13809 */
13810 for (i = 0; i < help->dthps_nprovs; i++) {
13811 prov = help->dthps_provs[i];
13812
13813 if (prov->dthp_generation == gen)
13814 break;
13815 }
13816
13817 /*
13818 * If there were no matches, we're done.
13819 */
13820 if (i == help->dthps_nprovs)
13821 break;
13822
13823 /*
13824 * Move the last helper provider into this slot.
13825 */
13826 help->dthps_nprovs--;
13827 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
13828 help->dthps_provs[help->dthps_nprovs] = NULL;
13829
13830 lck_mtx_unlock(&dtrace_lock);
13831
13832 /*
13833 * If we have a meta provider, remove this helper provider.
13834 */
13835 lck_mtx_lock(&dtrace_meta_lock);
13836 if (dtrace_meta_pid != NULL) {
13837 ASSERT(dtrace_deferred_pid == NULL);
13838 dtrace_helper_provider_remove(&prov->dthp_prov,
13839 p->p_pid);
13840 }
13841 lck_mtx_unlock(&dtrace_meta_lock);
13842
13843 dtrace_helper_provider_destroy(prov);
13844
13845 lck_mtx_lock(&dtrace_lock);
13846 }
13847
13848 return (0);
13849 }
13850
13851 static int
13852 dtrace_helper_validate(dtrace_helper_action_t *helper)
13853 {
13854 int err = 0, i;
13855 dtrace_difo_t *dp;
13856
13857 if ((dp = helper->dtha_predicate) != NULL)
13858 err += dtrace_difo_validate_helper(dp);
13859
13860 for (i = 0; i < helper->dtha_nactions; i++)
13861 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
13862
13863 return (err == 0);
13864 }
13865
13866 static int
13867 dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
13868 {
13869 dtrace_helpers_t *help;
13870 dtrace_helper_action_t *helper, *last;
13871 dtrace_actdesc_t *act;
13872 dtrace_vstate_t *vstate;
13873 dtrace_predicate_t *pred;
13874 int count = 0, nactions = 0, i;
13875
13876 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
13877 return (EINVAL);
13878
13879 help = p->p_dtrace_helpers;
13880 last = help->dthps_actions[which];
13881 vstate = &help->dthps_vstate;
13882
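	/*
	 * Walk the existing chain of helper actions of this kind, counting
	 * them and remembering the tail so the new action can be appended.
	 */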
13883 for (count = 0; last != NULL; last = last->dtha_next) {
13884 count++;
13885 if (last->dtha_next == NULL)
13886 break;
13887 }
13888
13889 /*
13890 * If we already have dtrace_helper_actions_max helper actions for this
13891 * helper action type, we'll refuse to add a new one.
13892 */
13893 if (count >= dtrace_helper_actions_max)
13894 return (ENOSPC);
13895
13896 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
13897 helper->dtha_generation = help->dthps_generation;
13898
13899 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
13900 ASSERT(pred->dtp_difo != NULL);
13901 dtrace_difo_hold(pred->dtp_difo);
13902 helper->dtha_predicate = pred->dtp_difo;
13903 }
13904
13905 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
13906 if (act->dtad_kind != DTRACEACT_DIFEXPR)
13907 goto err;
13908
13909 if (act->dtad_difo == NULL)
13910 goto err;
13911
13912 nactions++;
13913 }
13914
13915 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
13916 (helper->dtha_nactions = nactions), KM_SLEEP);
13917
13918 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
13919 dtrace_difo_hold(act->dtad_difo);
13920 helper->dtha_actions[i++] = act->dtad_difo;
13921 }
13922
13923 if (!dtrace_helper_validate(helper))
13924 goto err;
13925
13926 if (last == NULL) {
13927 help->dthps_actions[which] = helper;
13928 } else {
13929 last->dtha_next = helper;
13930 }
13931
13932 if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
13933 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
13934 dtrace_helptrace_next = 0;
13935 }
13936
13937 return (0);
13938 err:
13939 dtrace_helper_action_destroy(helper, vstate);
13940 return (EINVAL);
13941 }
13942
13943 static void
13944 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
13945 dof_helper_t *dofhp)
13946 {
13947 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
13948
13949 lck_mtx_lock(&dtrace_meta_lock);
13950 lck_mtx_lock(&dtrace_lock);
13951
13952 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
13953 /*
13954 * If the dtrace module is loaded but not attached, or if
13955 * there isn't a meta provider registered to deal with
13956 * these provider descriptions, we need to postpone creating
13957 * the actual providers until later.
13958 */
13959
13960 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
13961 dtrace_deferred_pid != help) {
13962 help->dthps_deferred = 1;
13963 help->dthps_pid = p->p_pid;
13964 help->dthps_next = dtrace_deferred_pid;
13965 help->dthps_prev = NULL;
13966 if (dtrace_deferred_pid != NULL)
13967 dtrace_deferred_pid->dthps_prev = help;
13968 dtrace_deferred_pid = help;
13969 }
13970
13971 lck_mtx_unlock(&dtrace_lock);
13972
13973 } else if (dofhp != NULL) {
13974 /*
13975 * If the dtrace module is loaded and we have a particular
13976 * helper provider description, pass that off to the
13977 * meta provider.
13978 */
13979
13980 lck_mtx_unlock(&dtrace_lock);
13981
13982 dtrace_helper_provide(dofhp, p->p_pid);
13983
13984 } else {
13985 /*
13986 * Otherwise, just pass all the helper provider descriptions
13987 * off to the meta provider.
13988 */
13989
13990 uint_t i;
13991 lck_mtx_unlock(&dtrace_lock);
13992
13993 for (i = 0; i < help->dthps_nprovs; i++) {
13994 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
13995 p->p_pid);
13996 }
13997 }
13998
13999 lck_mtx_unlock(&dtrace_meta_lock);
14000 }
14001
14002 static int
14003 dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
14004 {
14005 dtrace_helpers_t *help;
14006 dtrace_helper_provider_t *hprov, **tmp_provs;
14007 uint_t tmp_maxprovs, i;
14008
14009 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14010 help = p->p_dtrace_helpers;
14011 ASSERT(help != NULL);
14012
14013 /*
14014 * If we already have dtrace_helper_providers_max helper providers,
14015 * we'll refuse to add a new one.
14016 */
14017 if (help->dthps_nprovs >= dtrace_helper_providers_max)
14018 return (ENOSPC);
14019
14020 /*
14021 * Check to make sure this isn't a duplicate.
14022 */
14023 for (i = 0; i < help->dthps_nprovs; i++) {
14024 if (dofhp->dofhp_addr ==
14025 help->dthps_provs[i]->dthp_prov.dofhp_addr)
14026 return (EALREADY);
14027 }
14028
14029 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
14030 hprov->dthp_prov = *dofhp;
14031 hprov->dthp_ref = 1;
14032 hprov->dthp_generation = gen;
14033
14034 /*
14035 * Allocate a bigger table for helper providers if it's already full.
14036 */
14037 if (help->dthps_maxprovs == help->dthps_nprovs) {
14038 tmp_maxprovs = help->dthps_maxprovs;
14039 tmp_provs = help->dthps_provs;
14040
14041 if (help->dthps_maxprovs == 0)
14042 help->dthps_maxprovs = 2;
14043 else
14044 help->dthps_maxprovs *= 2;
14045 if (help->dthps_maxprovs > dtrace_helper_providers_max)
14046 help->dthps_maxprovs = dtrace_helper_providers_max;
14047
14048 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
14049
14050 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
14051 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14052
14053 if (tmp_provs != NULL) {
14054 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
14055 sizeof (dtrace_helper_provider_t *));
14056 kmem_free(tmp_provs, tmp_maxprovs *
14057 sizeof (dtrace_helper_provider_t *));
14058 }
14059 }
14060
14061 help->dthps_provs[help->dthps_nprovs] = hprov;
14062 help->dthps_nprovs++;
14063
14064 return (0);
14065 }
14066
14067 static void
14068 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
14069 {
14070 lck_mtx_lock(&dtrace_lock);
14071
14072 if (--hprov->dthp_ref == 0) {
14073 dof_hdr_t *dof;
14074 lck_mtx_unlock(&dtrace_lock);
14075 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
14076 dtrace_dof_destroy(dof);
14077 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
14078 } else {
14079 lck_mtx_unlock(&dtrace_lock);
14080 }
14081 }
14082
14083 static int
14084 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
14085 {
14086 uintptr_t daddr = (uintptr_t)dof;
14087 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
14088 dof_provider_t *provider;
14089 dof_probe_t *probe;
14090 uint8_t *arg;
14091 char *strtab, *typestr;
14092 dof_stridx_t typeidx;
14093 size_t typesz;
14094 uint_t nprobes, j, k;
14095
14096 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
14097
14098 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
14099 dtrace_dof_error(dof, "misaligned section offset");
14100 return (-1);
14101 }
14102
14103 /*
14104 * The section needs to be large enough to contain the DOF provider
14105 * structure appropriate for the given version.
14106 */
14107 if (sec->dofs_size <
14108 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
14109 offsetof(dof_provider_t, dofpv_prenoffs) :
14110 sizeof (dof_provider_t))) {
14111 dtrace_dof_error(dof, "provider section too small");
14112 return (-1);
14113 }
14114
14115 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
14116 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
14117 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
14118 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
14119 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
14120
14121 if (str_sec == NULL || prb_sec == NULL ||
14122 arg_sec == NULL || off_sec == NULL)
14123 return (-1);
14124
14125 enoff_sec = NULL;
14126
14127 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
14128 provider->dofpv_prenoffs != DOF_SECT_NONE &&
14129 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
14130 provider->dofpv_prenoffs)) == NULL)
14131 return (-1);
14132
14133 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
14134
14135 if (provider->dofpv_name >= str_sec->dofs_size ||
14136 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
14137 dtrace_dof_error(dof, "invalid provider name");
14138 return (-1);
14139 }
14140
14141 if (prb_sec->dofs_entsize == 0 ||
14142 prb_sec->dofs_entsize > prb_sec->dofs_size) {
14143 dtrace_dof_error(dof, "invalid entry size");
14144 return (-1);
14145 }
14146
14147 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
14148 dtrace_dof_error(dof, "misaligned entry size");
14149 return (-1);
14150 }
14151
14152 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
14153 dtrace_dof_error(dof, "invalid entry size");
14154 return (-1);
14155 }
14156
14157 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
14158 dtrace_dof_error(dof, "misaligned section offset");
14159 return (-1);
14160 }
14161
14162 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
14163 dtrace_dof_error(dof, "invalid entry size");
14164 return (-1);
14165 }
14166
14167 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
14168
14169 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
14170
14171 /*
14172 * Take a pass through the probes to check for errors.
14173 */
14174 for (j = 0; j < nprobes; j++) {
14175 probe = (dof_probe_t *)(uintptr_t)(daddr +
14176 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
14177
14178 if (probe->dofpr_func >= str_sec->dofs_size) {
14179 dtrace_dof_error(dof, "invalid function name");
14180 return (-1);
14181 }
14182
14183 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
14184 dtrace_dof_error(dof, "function name too long");
14185 return (-1);
14186 }
14187
14188 if (probe->dofpr_name >= str_sec->dofs_size ||
14189 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
14190 dtrace_dof_error(dof, "invalid probe name");
14191 return (-1);
14192 }
14193
14194 /*
14195 * The offset count must not wrap the index, and the offsets
14196 * must also not overflow the section's data.
14197 */
14198 if (probe->dofpr_offidx + probe->dofpr_noffs <
14199 probe->dofpr_offidx ||
14200 (probe->dofpr_offidx + probe->dofpr_noffs) *
14201 off_sec->dofs_entsize > off_sec->dofs_size) {
14202 dtrace_dof_error(dof, "invalid probe offset");
14203 return (-1);
14204 }
14205
14206 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
14207 /*
14208 * If there's no is-enabled offset section, make sure
14209 * there aren't any is-enabled offsets. Otherwise
14210 * perform the same checks as for probe offsets
14211 * (immediately above).
14212 */
14213 if (enoff_sec == NULL) {
14214 if (probe->dofpr_enoffidx != 0 ||
14215 probe->dofpr_nenoffs != 0) {
14216 dtrace_dof_error(dof, "is-enabled "
14217 "offsets with null section");
14218 return (-1);
14219 }
14220 } else if (probe->dofpr_enoffidx +
14221 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
14222 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
14223 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
14224 dtrace_dof_error(dof, "invalid is-enabled "
14225 "offset");
14226 return (-1);
14227 }
14228
14229 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
14230 dtrace_dof_error(dof, "zero probe and "
14231 "is-enabled offsets");
14232 return (-1);
14233 }
14234 } else if (probe->dofpr_noffs == 0) {
14235 dtrace_dof_error(dof, "zero probe offsets");
14236 return (-1);
14237 }
14238
14239 if (probe->dofpr_argidx + probe->dofpr_xargc <
14240 probe->dofpr_argidx ||
14241 (probe->dofpr_argidx + probe->dofpr_xargc) *
14242 arg_sec->dofs_entsize > arg_sec->dofs_size) {
14243 dtrace_dof_error(dof, "invalid args");
14244 return (-1);
14245 }
14246
14247 typeidx = probe->dofpr_nargv;
14248 typestr = strtab + probe->dofpr_nargv;
14249 for (k = 0; k < probe->dofpr_nargc; k++) {
14250 if (typeidx >= str_sec->dofs_size) {
14251 dtrace_dof_error(dof, "bad "
14252 "native argument type");
14253 return (-1);
14254 }
14255
14256 typesz = strlen(typestr) + 1;
14257 if (typesz > DTRACE_ARGTYPELEN) {
14258 dtrace_dof_error(dof, "native "
14259 "argument type too long");
14260 return (-1);
14261 }
14262 typeidx += typesz;
14263 typestr += typesz;
14264 }
14265
14266 typeidx = probe->dofpr_xargv;
14267 typestr = strtab + probe->dofpr_xargv;
14268 for (k = 0; k < probe->dofpr_xargc; k++) {
14269 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
14270 dtrace_dof_error(dof, "bad "
14271 "native argument index");
14272 return (-1);
14273 }
14274
14275 if (typeidx >= str_sec->dofs_size) {
14276 dtrace_dof_error(dof, "bad "
14277 "translated argument type");
14278 return (-1);
14279 }
14280
14281 typesz = strlen(typestr) + 1;
14282 if (typesz > DTRACE_ARGTYPELEN) {
14283 dtrace_dof_error(dof, "translated argument "
14284 "type too long");
14285 return (-1);
14286 }
14287
14288 typeidx += typesz;
14289 typestr += typesz;
14290 }
14291 }
14292
14293 return (0);
14294 }
14295
14296 static int
14297 dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
14298 {
14299 dtrace_helpers_t *help;
14300 dtrace_vstate_t *vstate;
14301 dtrace_enabling_t *enab = NULL;
14302 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
14303 uintptr_t daddr = (uintptr_t)dof;
14304
14305 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14306
14307 if ((help = p->p_dtrace_helpers) == NULL)
14308 help = dtrace_helpers_create(p);
14309
14310 vstate = &help->dthps_vstate;
14311
14312 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
14313 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
14314 dtrace_dof_destroy(dof);
14315 return (rv);
14316 }
14317
14318 /*
14319 * Look for helper providers and validate their descriptions.
14320 */
14321 if (dhp != NULL) {
14322 for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
14323 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
14324 dof->dofh_secoff + i * dof->dofh_secsize);
14325
14326 if (sec->dofs_type != DOF_SECT_PROVIDER)
14327 continue;
14328
14329 if (dtrace_helper_provider_validate(dof, sec) != 0) {
14330 dtrace_enabling_destroy(enab);
14331 dtrace_dof_destroy(dof);
14332 return (-1);
14333 }
14334
14335 nprovs++;
14336 }
14337 }
14338
14339 /*
14340 * Now we need to walk through the ECB descriptions in the enabling.
14341 */
14342 for (i = 0; i < enab->dten_ndesc; i++) {
14343 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
14344 dtrace_probedesc_t *desc = &ep->dted_probe;
14345
14346 /* APPLE NOTE: Darwin employs size-bounded string operations. */
14347 if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
14348 continue;
14349
14350 if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
14351 continue;
14352
14353 if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
14354 continue;
14355
14356 if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
14357 ep)) != 0) {
14358 /*
14359 * Adding this helper action failed -- we are now going
14360 * to rip out the entire generation and return failure.
14361 */
14362 (void) dtrace_helper_destroygen(p, help->dthps_generation);
14363 dtrace_enabling_destroy(enab);
14364 dtrace_dof_destroy(dof);
14365 return (-1);
14366 }
14367
14368 nhelpers++;
14369 }
14370
14371 if (nhelpers < enab->dten_ndesc)
14372 dtrace_dof_error(dof, "unmatched helpers");
14373
14374 gen = help->dthps_generation++;
14375 dtrace_enabling_destroy(enab);
14376
14377 if (dhp != NULL && nprovs > 0) {
14378 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
14379 if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
14380 lck_mtx_unlock(&dtrace_lock);
14381 dtrace_helper_provider_register(p, help, dhp);
14382 lck_mtx_lock(&dtrace_lock);
14383
14384 destroy = 0;
14385 }
14386 }
14387
14388 if (destroy)
14389 dtrace_dof_destroy(dof);
14390
14391 return (gen);
14392 }
14393
14394 /*
14395 * APPLE NOTE: DTrace lazy dof implementation
14396 *
14397 * DTrace user static probes (USDT probes) and helper actions are loaded
14398 * in a process by processing dof sections. The dof sections are passed
14399 * into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
14400 * expensive to process dof for a process that will never use it. There
14401 * is a memory cost (allocating the providers/probes), and a cpu cost
14402 * (creating the providers/probes).
14403 *
14404 * To reduce this cost, we use "lazy dof". The normal procedure for
14405 * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
14406 * block, and invoke dof_slurp_helper() on them. When "lazy dof" is
14407 * used, each process retains the dof_ioctl_data_t block, instead of
14408 * copying in the data it points to.
14409 *
14410 * The dof_ioctl_data_t blocks are managed as if they were the actual
14411 * processed dof; on fork the block is copied to the child, on exec and
14412 * exit the block is freed.
14413 *
14414 * If the process loads a library (or libraries) containing additional dof, the
14415 * new dof_ioctl_data_t is merged with the existing block.
14416 *
14417 * There are a few catches that make this slightly more difficult.
14418 * When dyld registers dof_ioctl_data_t blocks, it expects a unique
14419 * identifier value for each dof in the block. In non-lazy dof terms,
14420 * this is the generation that dof was loaded in. If we hand back
14421 * a UID for a lazy dof, that same UID must be able to unload the
14422 * dof once it has become non-lazy. To meet this requirement, the
14423 * code that loads lazy dof requires that the UIDs for dof(s) in
14424 * the lazy dof be sorted in ascending order. It is okay to skip
14425 * UIDs; e.g., 1 -> 5 -> 6 is legal.
14426 *
14427 * Once a process has become non-lazy, it will stay non-lazy. All
14428 * future dof operations for that process will be non-lazy, even
14429 * if the dof mode transitions back to lazy.
14430 *
14431 * Always do lazy dof checks before non-lazy (i.e., in fork, exit, and exec).
14432 * That way if the lazy check fails due to transitioning to non-lazy, the
14433 * right thing is done with the newly faulted in dof.
14434 */
14435
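/*
 * To summarize how the generation travels (see dtrace_lazy_dofs_add() and
 * dtrace_lazy_dofs_proc_iterate_doit() below): while a dof_helper_t sits on
 * a process's lazy list, its dofhp_dof field is overloaded to carry the
 * generation; the original value is recovered from dofhp_addr when the dof
 * is finally faulted in:
 *
 *	lazy add:  ASSERT(dofhp_dof == dofhp_addr); dofhp_dof = generation;
 *	fault in:  generation = dofhp_dof; dofhp_dof = dofhp_addr;
 */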
14436 /*
14437 * This method is a bit squicky. It must handle:
14438 *
14439 * dof should not be lazy.
14440 * dof should have been handled lazily, but there was an error
14441 * dof was handled lazily, and needs to be freed.
14442 * dof was handled lazily, and must not be freed.
14443 *
14444 *
14445 * Returns EACCES if dof should be handled non-lazily.
14446 *
14447 * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
14448 *
14449 * If the dofs data is claimed by this method, dofs_claimed will be set.
14450 * Callers should not free claimed dofs.
14451 */
14452 static int
14453 dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
14454 {
14455 ASSERT(p);
14456 ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
14457
14458 int rval = 0;
14459 *dofs_claimed = 0;
14460
14461 lck_rw_lock_shared(&dtrace_dof_mode_lock);
14462
14463 /*
14464 * If we have lazy dof, dof mode better be LAZY_ON.
14465 */
14466 ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
14467 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
14468 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
14469
14470 /*
14471 * Any existing helpers force non-lazy behavior.
14472 */
14473 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
14474 lck_mtx_lock(&p->p_dtrace_sprlock);
14475
14476 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
14477 unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
14478 unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
14479
14480 /*
14481 * Range check...
14482 */
14483 if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
14484 dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
14485 rval = EINVAL;
14486 goto unlock;
14487 }
14488
14489 /*
14490 * Each dof being added must be assigned a unique generation.
14491 */
14492 uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
14493 for (i=0; i<incoming_dofs->dofiod_count; i++) {
14494 /*
14495 * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
14496 */
14497 ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
14498 incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
14499 }
14500
14501
14502 if (existing_dofs) {
14503 /*
14504 * Merge the existing and incoming dofs
14505 */
14506 size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
14507 dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
14508
14509 bcopy(&existing_dofs->dofiod_helpers[0],
14510 &merged_dofs->dofiod_helpers[0],
14511 sizeof(dof_helper_t) * existing_dofs_count);
14512 bcopy(&incoming_dofs->dofiod_helpers[0],
14513 &merged_dofs->dofiod_helpers[existing_dofs_count],
14514 sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
14515
14516 merged_dofs->dofiod_count = merged_dofs_count;
14517
14518 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
14519
14520 p->p_dtrace_lazy_dofs = merged_dofs;
14521 } else {
14522 /*
14523 * Claim the incoming dofs
14524 */
14525 *dofs_claimed = 1;
14526 p->p_dtrace_lazy_dofs = incoming_dofs;
14527 }
14528
14529 #if DEBUG
14530 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
14531 for (i=0; i<all_dofs->dofiod_count-1; i++) {
14532 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
14533 }
14534 #endif /* DEBUG */
14535
14536 unlock:
14537 lck_mtx_unlock(&p->p_dtrace_sprlock);
14538 } else {
14539 rval = EACCES;
14540 }
14541
14542 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
14543
14544 return rval;
14545 }
14546
14547 /*
14548 * Returns:
14549 *
14550 * EINVAL: lazy dof is enabled, but the requested generation was not found.
14551 * EACCES: This removal needs to be handled non-lazily.
14552 */
14553 static int
14554 dtrace_lazy_dofs_remove(proc_t *p, int generation)
14555 {
14556 int rval = EINVAL;
14557
14558 lck_rw_lock_shared(&dtrace_dof_mode_lock);
14559
14560 /*
14561 * If we have lazy dof, dof mode better be LAZY_ON.
14562 */
14563 ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
14564 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
14565 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
14566
14567 /*
14568 * Any existing helpers force non-lazy behavior.
14569 */
14570 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
14571 lck_mtx_lock(&p->p_dtrace_sprlock);
14572
14573 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
14574
14575 if (existing_dofs) {
14576 int index, existing_dofs_count = existing_dofs->dofiod_count;
14577 for (index=0; index<existing_dofs_count; index++) {
14578 if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
14579 dof_ioctl_data_t* removed_dofs = NULL;
14580
14581 /*
14582 * If there is only 1 dof, we'll delete it and swap in NULL.
14583 */
14584 if (existing_dofs_count > 1) {
14585 int removed_dofs_count = existing_dofs_count - 1;
14586 size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
14587
14588 removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
14589 removed_dofs->dofiod_count = removed_dofs_count;
14590
14591 /*
14592 * copy the remaining data.
14593 */
14594 if (index > 0) {
14595 bcopy(&existing_dofs->dofiod_helpers[0],
14596 &removed_dofs->dofiod_helpers[0],
14597 index * sizeof(dof_helper_t));
14598 }
14599
14600 if (index < existing_dofs_count-1) {
14601 bcopy(&existing_dofs->dofiod_helpers[index+1],
14602 &removed_dofs->dofiod_helpers[index],
14603 (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
14604 }
14605 }
14606
14607 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
14608
14609 p->p_dtrace_lazy_dofs = removed_dofs;
14610
14611 rval = KERN_SUCCESS;
14612
14613 break;
14614 }
14615 }
14616
14617 #if DEBUG
14618 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
14619 if (all_dofs) {
14620 unsigned int i;
14621 for (i=0; i<all_dofs->dofiod_count-1; i++) {
14622 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
14623 }
14624 }
14625 #endif
14626
14627 }
14628
14629 lck_mtx_unlock(&p->p_dtrace_sprlock);
14630 } else {
14631 rval = EACCES;
14632 }
14633
14634 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
14635
14636 return rval;
14637 }
14638
14639 void
14640 dtrace_lazy_dofs_destroy(proc_t *p)
14641 {
14642 lck_rw_lock_shared(&dtrace_dof_mode_lock);
14643 lck_mtx_lock(&p->p_dtrace_sprlock);
14644
14645 /*
14646 * If we have lazy dof, dof mode better be LAZY_ON, or we must be exiting.
14647 * We cannot assert against DTRACE_DOF_MODE_NEVER here, because we are called from
14648 * kern_exit.c and kern_exec.c.
14649 */
14650 ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON || p->p_lflag & P_LEXIT);
14651 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
14652
14653 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
14654 p->p_dtrace_lazy_dofs = NULL;
14655
14656 lck_mtx_unlock(&p->p_dtrace_sprlock);
14657 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
14658
14659 if (lazy_dofs) {
14660 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
14661 }
14662 }
14663
14664 void
14665 dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
14666 {
14667 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
14668 lck_mtx_assert(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
14669 lck_mtx_assert(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
14670
14671 lck_rw_lock_shared(&dtrace_dof_mode_lock);
14672 lck_mtx_lock(&parent->p_dtrace_sprlock);
14673
14674 /*
14675 * If we have lazy dof, dof mode better be LAZY_ON, or we must be exiting.
14676 * We cannot assert against DTRACE_DOF_MODE_NEVER here, because we are called from
14677 * kern_fork.c
14678 */
14679 ASSERT(parent->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
14680 ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
14681 /*
14682 * In theory we should hold the child sprlock, but this is safe...
14683 */
14684 ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
14685
14686 dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
14687 dof_ioctl_data_t* child_dofs = NULL;
14688 if (parent_dofs) {
14689 size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
14690 child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
14691 bcopy(parent_dofs, child_dofs, parent_dofs_size);
14692 }
14693
14694 lck_mtx_unlock(&parent->p_dtrace_sprlock);
14695
14696 if (child_dofs) {
14697 lck_mtx_lock(&child->p_dtrace_sprlock);
14698 child->p_dtrace_lazy_dofs = child_dofs;
14699 lck_mtx_unlock(&child->p_dtrace_sprlock);
14700 }
14701
14702 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
14703 }
14704
14705 static int
14706 dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
14707 {
14708 #pragma unused(ignored)
14709 /*
14710 * Okay to NULL test without taking the sprlock.
14711 */
14712 return p->p_dtrace_lazy_dofs != NULL;
14713 }
14714
14715 static int
14716 dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
14717 {
14718 #pragma unused(ignored)
14719 /*
14720 * It is possible this process may exit during our attempt to
14721 * fault in the dof. We could fix this by holding locks longer,
14722 * but the errors are benign.
14723 */
14724 lck_mtx_lock(&p->p_dtrace_sprlock);
14725
14726 /*
14727 * In this case only, it is okay to have lazy dof when dof mode is DTRACE_DOF_MODE_LAZY_OFF
14728 */
14729 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
14730 ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
14731
14732
14733 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
14734 p->p_dtrace_lazy_dofs = NULL;
14735
14736 lck_mtx_unlock(&p->p_dtrace_sprlock);
14737
14738 /*
14739 * Process each dof_helper_t
14740 */
14741 if (lazy_dofs != NULL) {
14742 unsigned int i;
14743 int rval;
14744
14745 for (i=0; i<lazy_dofs->dofiod_count; i++) {
14746 /*
14747 * When loading lazy dof, we depend on the generations being sorted in ascending order.
14748 */
14749 ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
14750
14751 dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
14752
14753 /*
14754 * We stored the generation in dofhp_dof. Save it, and restore the original value.
14755 */
14756 int generation = dhp->dofhp_dof;
14757 dhp->dofhp_dof = dhp->dofhp_addr;
14758
14759 dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
14760
14761 if (dof != NULL) {
14762 dtrace_helpers_t *help;
14763
14764 lck_mtx_lock(&dtrace_lock);
14765
14766 /*
14767 * This must be done with the dtrace_lock held
14768 */
14769 if ((help = p->p_dtrace_helpers) == NULL)
14770 help = dtrace_helpers_create(p);
14771
14772 /*
14773 * If the generation value has been bumped, someone snuck in
14774 * when we released the dtrace lock. We have to dump this generation,
14775 * when we released the dtrace lock. We have to dump this generation;
14776 * there is no safe way to load it.
14777 if (help->dthps_generation <= generation) {
14778 help->dthps_generation = generation;
14779
14780 /*
14781 * dtrace_helper_slurp() takes responsibility for the dof --
14782 * it may free it now or it may save it and free it later.
14783 */
14784 if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
14785 dtrace_dof_error(NULL, "returned value did not match expected generation");
14786 }
14787 }
14788
14789 lck_mtx_unlock(&dtrace_lock);
14790 }
14791 }
14792
14793 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
14794 }
14795
14796 return PROC_RETURNED;
14797 }
14798
14799 static dtrace_helpers_t *
14800 dtrace_helpers_create(proc_t *p)
14801 {
14802 dtrace_helpers_t *help;
14803
14804 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14805 ASSERT(p->p_dtrace_helpers == NULL);
14806
14807 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
14808 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
14809 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
14810
14811 p->p_dtrace_helpers = help;
14812 dtrace_helpers++;
14813
14814 return (help);
14815 }
14816
14817 static void
14818 dtrace_helpers_destroy(proc_t* p)
14819 {
14820 dtrace_helpers_t *help;
14821 dtrace_vstate_t *vstate;
14822 uint_t i;
14823
14824 lck_mtx_lock(&dtrace_lock);
14825
14826 ASSERT(p->p_dtrace_helpers != NULL);
14827 ASSERT(dtrace_helpers > 0);
14828
14829 help = p->p_dtrace_helpers;
14830 vstate = &help->dthps_vstate;
14831
14832 /*
14833 * We're now going to lose the help from this process.
14834 */
14835 p->p_dtrace_helpers = NULL;
14836 dtrace_sync();
14837
14838 /*
14839 * Destroy the helper actions.
14840 */
14841 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14842 dtrace_helper_action_t *h, *next;
14843
14844 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14845 next = h->dtha_next;
14846 dtrace_helper_action_destroy(h, vstate);
14847 h = next;
14848 }
14849 }
14850
14851 lck_mtx_unlock(&dtrace_lock);
14852
14853 /*
14854 * Destroy the helper providers.
14855 */
14856 if (help->dthps_maxprovs > 0) {
14857 lck_mtx_lock(&dtrace_meta_lock);
14858 if (dtrace_meta_pid != NULL) {
14859 ASSERT(dtrace_deferred_pid == NULL);
14860
14861 for (i = 0; i < help->dthps_nprovs; i++) {
14862 dtrace_helper_provider_remove(
14863 &help->dthps_provs[i]->dthp_prov, p->p_pid);
14864 }
14865 } else {
14866 lck_mtx_lock(&dtrace_lock);
14867 ASSERT(help->dthps_deferred == 0 ||
14868 help->dthps_next != NULL ||
14869 help->dthps_prev != NULL ||
14870 help == dtrace_deferred_pid);
14871
14872 /*
14873 * Remove the helper from the deferred list.
14874 */
14875 if (help->dthps_next != NULL)
14876 help->dthps_next->dthps_prev = help->dthps_prev;
14877 if (help->dthps_prev != NULL)
14878 help->dthps_prev->dthps_next = help->dthps_next;
14879 if (dtrace_deferred_pid == help) {
14880 dtrace_deferred_pid = help->dthps_next;
14881 ASSERT(help->dthps_prev == NULL);
14882 }
14883
14884 lck_mtx_unlock(&dtrace_lock);
14885 }
14886
14887 lck_mtx_unlock(&dtrace_meta_lock);
14888
14889 for (i = 0; i < help->dthps_nprovs; i++) {
14890 dtrace_helper_provider_destroy(help->dthps_provs[i]);
14891 }
14892
14893 kmem_free(help->dthps_provs, help->dthps_maxprovs *
14894 sizeof (dtrace_helper_provider_t *));
14895 }
14896
14897 lck_mtx_lock(&dtrace_lock);
14898
14899 dtrace_vstate_fini(&help->dthps_vstate);
14900 kmem_free(help->dthps_actions,
14901 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
14902 kmem_free(help, sizeof (dtrace_helpers_t));
14903
14904 --dtrace_helpers;
14905 lck_mtx_unlock(&dtrace_lock);
14906 }
14907
14908 static void
14909 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
14910 {
14911 dtrace_helpers_t *help, *newhelp;
14912 dtrace_helper_action_t *helper, *new, *last;
14913 dtrace_difo_t *dp;
14914 dtrace_vstate_t *vstate;
14915 uint_t i;
14916 int j, sz, hasprovs = 0;
14917
14918 lck_mtx_lock(&dtrace_lock);
14919 ASSERT(from->p_dtrace_helpers != NULL);
14920 ASSERT(dtrace_helpers > 0);
14921
14922 help = from->p_dtrace_helpers;
14923 newhelp = dtrace_helpers_create(to);
14924 ASSERT(to->p_dtrace_helpers != NULL);
14925
14926 newhelp->dthps_generation = help->dthps_generation;
14927 vstate = &newhelp->dthps_vstate;
14928
14929 /*
14930 * Duplicate the helper actions.
14931 */
14932 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14933 if ((helper = help->dthps_actions[i]) == NULL)
14934 continue;
14935
14936 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
14937 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
14938 KM_SLEEP);
14939 new->dtha_generation = helper->dtha_generation;
14940
14941 if ((dp = helper->dtha_predicate) != NULL) {
14942 dp = dtrace_difo_duplicate(dp, vstate);
14943 new->dtha_predicate = dp;
14944 }
14945
14946 new->dtha_nactions = helper->dtha_nactions;
14947 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
14948 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
14949
14950 for (j = 0; j < new->dtha_nactions; j++) {
14951 dtrace_difo_t *dpj = helper->dtha_actions[j];
14952
14953 ASSERT(dpj != NULL);
14954 dpj = dtrace_difo_duplicate(dpj, vstate);
14955 new->dtha_actions[j] = dpj;
14956 }
14957
14958 if (last != NULL) {
14959 last->dtha_next = new;
14960 } else {
14961 newhelp->dthps_actions[i] = new;
14962 }
14963
14964 last = new;
14965 }
14966 }
14967
14968 /*
14969 * Duplicate the helper providers and register them with the
14970 * DTrace framework.
14971 */
14972 if (help->dthps_nprovs > 0) {
14973 newhelp->dthps_nprovs = help->dthps_nprovs;
14974 newhelp->dthps_maxprovs = help->dthps_nprovs;
14975 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
14976 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14977 for (i = 0; i < newhelp->dthps_nprovs; i++) {
14978 newhelp->dthps_provs[i] = help->dthps_provs[i];
14979 newhelp->dthps_provs[i]->dthp_ref++;
14980 }
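		/*
		 * Note that, unlike the helper actions deep-copied via
		 * dtrace_difo_duplicate() above, provider descriptions are
		 * shared between parent and child by reference count.
		 */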
14981
14982 hasprovs = 1;
14983 }
14984
14985 lck_mtx_unlock(&dtrace_lock);
14986
14987 if (hasprovs)
14988 dtrace_helper_provider_register(to, newhelp, NULL);
14989 }
14990
14991 /*
14992 * DTrace Hook Functions
14993 */
14994
14995 /*
14996 * APPLE NOTE: dtrace_modctl_* routines for kext support.
14997 * Used to manipulate the modctl list within dtrace xnu.
14998 */
14999
15000 modctl_t *dtrace_modctl_list;
15001
15002 static void
15003 dtrace_modctl_add(struct modctl * newctl)
15004 {
15005 struct modctl *nextp, *prevp;
15006
15007 ASSERT(newctl != NULL);
15008 lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);
15009
15010 // Insert new module at the front of the list.
15011
15012 newctl->mod_next = dtrace_modctl_list;
15013 dtrace_modctl_list = newctl;
15014
15015 /*
15016 * If a module exists with the same name, then that module
15017 * must have been unloaded with enabled probes. We will move
15018 * the unloaded module to the new module's stale chain and
15019 * then stop traversing the list.
15020 */
15021
15022 prevp = newctl;
15023 nextp = newctl->mod_next;
15024
15025 while (nextp != NULL) {
15026 if (nextp->mod_loaded) {
15027 /* This is a loaded module. Keep traversing. */
15028 prevp = nextp;
15029 nextp = nextp->mod_next;
15030 continue;
15031 }
15032 else {
15033 /* Found an unloaded module */
15034 if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
15035 /* Names don't match. Keep traversing. */
15036 prevp = nextp;
15037 nextp = nextp->mod_next;
15038 continue;
15039 }
15040 else {
15041 /* We found a stale entry, move it. We're done. */
15042 prevp->mod_next = nextp->mod_next;
15043 newctl->mod_stale = nextp;
15044 nextp->mod_next = NULL;
15045 break;
15046 }
15047 }
15048 }
15049 }
15050
15051 static modctl_t *
15052 dtrace_modctl_lookup(struct kmod_info * kmod)
15053 {
15054 lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);
15055
15056 struct modctl * ctl;
15057
15058 for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
15059 if (ctl->mod_id == kmod->id)
15060 return(ctl);
15061 }
15062 return (NULL);
15063 }
15064
15065 /*
15066 * This routine is called from dtrace_module_unloaded().
15067 * It removes a modctl structure and its stale chain
15068 * from the kext shadow list.
15069 */
15070 static void
15071 dtrace_modctl_remove(struct modctl * ctl)
15072 {
15073 ASSERT(ctl != NULL);
15074 lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);
15075 modctl_t *prevp, *nextp, *curp;
15076
15077 // Remove stale chain first
15078 for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
15079 nextp = curp->mod_stale;
15080 /* There should NEVER be user symbols allocated at this point */
15081 ASSERT(curp->mod_user_symbols == NULL);
15082 kmem_free(curp, sizeof(modctl_t));
15083 }
15084
15085 prevp = NULL;
15086 curp = dtrace_modctl_list;
15087
15088 while (curp != ctl) {
15089 prevp = curp;
15090 curp = curp->mod_next;
15091 }
15092
15093 if (prevp != NULL) {
15094 prevp->mod_next = ctl->mod_next;
15095 }
15096 else {
15097 dtrace_modctl_list = ctl->mod_next;
15098 }
15099
15100 /* There should NEVER be user symbols allocated at this point */
15101 ASSERT(ctl->mod_user_symbols == NULL);
15102
15103 kmem_free (ctl, sizeof(modctl_t));
15104 }
15105
15106 /*
15107 * APPLE NOTE: The kext loader will call dtrace_module_loaded
15108 * when the kext is loaded in memory, but before calling the
15109 * kext's start routine.
15110 *
15111 * Return 0 on success
15112 * Return -1 on failure
15113 */
15114
15115 static int
15116 dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
15117 {
15118 dtrace_provider_t *prv;
15119
15120 /*
15121 * If kernel symbols have been disabled, return immediately.
15122 * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode, so it is safe to test without holding locks.
15123 */
15124 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
15125 return 0;
15126
15127 struct modctl *ctl = NULL;
15128 if (!kmod || kmod->address == 0 || kmod->size == 0)
15129 return(-1);
15130
15131 lck_mtx_lock(&dtrace_provider_lock);
15132 lck_mtx_lock(&mod_lock);
15133
15134 /*
15135 * Have we seen this kext before?
15136 */
15137
15138 ctl = dtrace_modctl_lookup(kmod);
15139
15140 if (ctl != NULL) {
15141 /* bail... we already have this kext in the modctl list */
15142 lck_mtx_unlock(&mod_lock);
15143 lck_mtx_unlock(&dtrace_provider_lock);
15144 if (dtrace_err_verbose)
15145 cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
15146 return(-1);
15147 }
15148 else {
15149 ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
15150 if (ctl == NULL) {
15151 if (dtrace_err_verbose)
15152 cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
15153 lck_mtx_unlock(&mod_lock);
15154 lck_mtx_unlock(&dtrace_provider_lock);
15155 return (-1);
15156 }
15157 ctl->mod_next = NULL;
15158 ctl->mod_stale = NULL;
15159 strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
15160 ctl->mod_loadcnt = kmod->id;
15161 ctl->mod_nenabled = 0;
15162 ctl->mod_address = kmod->address;
15163 ctl->mod_size = kmod->size;
15164 ctl->mod_id = kmod->id;
15165 ctl->mod_loaded = 1;
15166 ctl->mod_flags = 0;
15167 ctl->mod_user_symbols = NULL;
15168
15169 /*
15170 * Find the UUID for this module, if it has one
15171 */
15172 kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
15173 struct load_command* load_cmd = (struct load_command *)&header[1];
15174 uint32_t i;
15175 for (i = 0; i < header->ncmds; i++) {
15176 if (load_cmd->cmd == LC_UUID) {
15177 struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
15178 memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
15179 ctl->mod_flags |= MODCTL_HAS_UUID;
15180 break;
15181 }
15182 load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
15183 }
15184
15185 if (ctl->mod_address == g_kernel_kmod_info.address) {
15186 ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
15187 }
15188 }
15189 dtrace_modctl_add(ctl);
15190
15191 /*
15192 * We must hold the dtrace_lock to safely test non permanent dtrace_fbt_symbol_mode(s)
15193 */
15194 lck_mtx_lock(&dtrace_lock);
15195
15196 /*
15197 * DTrace must decide if it will instrument modules lazily via
15198 * userspace symbols (default mode), or instrument immediately via
15199 * kernel symbols (non-default mode)
15200 * kernel symbols (non-default mode).
15201 * When in default/lazy mode, DTrace will only support modules
15202 * built with a valid UUID.
15203 *
15204 * Overriding the default can be done explicitly in one of
15205 * the following two ways.
15206 *
15207 * A module can force symbols from kernel space using the plist key,
15208 * OSBundleForceDTraceInit (see kmod.h). If this per-kext state is set,
15209 * we fall through and instrument this module now.
15210 *
15211 * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
15212 * from kernel space (see dtrace_impl.h). If this system state is set
15213 * to a non-userspace mode, we fall through and instrument the module now.
15214 */
15215
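	/*
	 * That is: lazy (userspace-symbol) instrumentation is used only when
	 * the global mode is DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE and this
	 * kext has not set KMOD_DTRACE_FORCE_INIT.
	 */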
15216 if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
15217 (!(flag & KMOD_DTRACE_FORCE_INIT)))
15218 {
15219 /* We will instrument the module lazily -- this is the default */
15220 lck_mtx_unlock(&dtrace_lock);
15221 lck_mtx_unlock(&mod_lock);
15222 lck_mtx_unlock(&dtrace_provider_lock);
15223 return 0;
15224 }
15225
15226 /* We will instrument the module immediately using kernel symbols */
15227 ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
15228
15229 lck_mtx_unlock(&dtrace_lock);
15230
15231 /*
15232 * We're going to call each provider's per-module provide operation
15233 * specifying only this module.
15234 */
15235 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
15236 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
15237
15238 /*
15239 * APPLE NOTE: The contract with the kext loader is that once this function
15240 * has completed, it may delete kernel symbols at will.
15241 * We must set this while still holding the mod_lock.
15242 */
15243 ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
15244
15245 lck_mtx_unlock(&mod_lock);
15246 lck_mtx_unlock(&dtrace_provider_lock);
15247
15248 /*
15249 * If we have any retained enablings, we need to match against them.
15250 * Enabling probes requires that cpu_lock be held, and we cannot hold
15251 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
15252 * module. (In particular, this happens when loading scheduling
15253 * classes.) So if we have any retained enablings, we need to dispatch
15254 * our task queue to do the match for us.
15255 */
15256 lck_mtx_lock(&dtrace_lock);
15257
15258 if (dtrace_retained == NULL) {
15259 lck_mtx_unlock(&dtrace_lock);
15260 return 0;
15261 }
15262
15263 /* APPLE NOTE!
15264 *
15265 * The cpu_lock mentioned above is only held by dtrace code; the rest of Apple's xnu never
15266 * actually holds it. Thus the comment above is invalid; we can directly invoke
15267 * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid
15268 * the delay call as well.
15269 */
15270 lck_mtx_unlock(&dtrace_lock);
15271
15272 dtrace_enabling_matchall();
15273
15274 return 0;
15275 }
15276
15277 /*
15278 * Return 0 on success
15279 * Return -1 on failure
15280 */
15281 static int
15282 dtrace_module_unloaded(struct kmod_info *kmod)
15283 {
15284 dtrace_probe_t template, *probe, *first, *next;
15285 dtrace_provider_t *prov;
15286 struct modctl *ctl = NULL;
15287 struct modctl *syncctl = NULL;
15288 struct modctl *nextsyncctl = NULL;
15289 int syncmode = 0;
15290
15291 lck_mtx_lock(&dtrace_provider_lock);
15292 lck_mtx_lock(&mod_lock);
15293 lck_mtx_lock(&dtrace_lock);
15294
15295 if (kmod == NULL) {
15296 syncmode = 1;
15297 }
15298 else {
15299 ctl = dtrace_modctl_lookup(kmod);
15300 if (ctl == NULL)
15301 {
15302 lck_mtx_unlock(&dtrace_lock);
15303 lck_mtx_unlock(&mod_lock);
15304 lck_mtx_unlock(&dtrace_provider_lock);
15305 return (-1);
15306 }
15307 ctl->mod_loaded = 0;
15308 ctl->mod_address = 0;
15309 ctl->mod_size = 0;
15310 }
15311
15312 if (dtrace_bymod == NULL) {
15313 /*
15314 * The DTrace module is loaded (obviously) but not attached;
15315 * we don't have any work to do.
15316 */
15317 if (ctl != NULL)
15318 (void)dtrace_modctl_remove(ctl);
15319 lck_mtx_unlock(&dtrace_lock);
15320 lck_mtx_unlock(&mod_lock);
15321 lck_mtx_unlock(&dtrace_provider_lock);
15322 return(0);
15323 }
15324
15325 /* Syncmode set means we target and traverse the entire modctl list. */
15326 if (syncmode)
15327 nextsyncctl = dtrace_modctl_list;
15328
15329 syncloop:
15330 if (syncmode)
15331 {
15332 /* find a stale modctl struct */
15333 for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
15334 if (syncctl->mod_address == 0)
15335 break;
15336 }
15337 if (syncctl==NULL)
15338 {
15339 /* We have no more work to do */
15340 lck_mtx_unlock(&dtrace_lock);
15341 lck_mtx_unlock(&mod_lock);
15342 lck_mtx_unlock(&dtrace_provider_lock);
15343 return(0);
15344 }
15345 else {
15346 /* keep track of next syncctl in case this one is removed */
15347 nextsyncctl = syncctl->mod_next;
15348 ctl = syncctl;
15349 }
15350 }
15351
15352 template.dtpr_mod = ctl->mod_modname;
15353
15354 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
15355 probe != NULL; probe = probe->dtpr_nextmod) {
15356 if (probe->dtpr_ecb != NULL) {
15357 /*
15358 * This shouldn't _actually_ be possible -- we're
15359 * unloading a module that has an enabled probe in it.
15360 * (It's normally up to the provider to make sure that
15361 * this can't happen.) However, because dtps_enable()
15362 * doesn't have a failure mode, there can be an
15363 * enable/unload race. Upshot: we don't want to
15364 * assert, but we're not going to disable the
15365 * probe, either.
15366 */
15367
15368
15369 if (syncmode) {
15370 /* We're syncing, let's look at next in list */
15371 goto syncloop;
15372 }
15373
15374 lck_mtx_unlock(&dtrace_lock);
15375 lck_mtx_unlock(&mod_lock);
15376 lck_mtx_unlock(&dtrace_provider_lock);
15377
15378 if (dtrace_err_verbose) {
15379 cmn_err(CE_WARN, "unloaded module '%s' had "
15380 "enabled probes", ctl->mod_modname);
15381 }
15382 return(-1);
15383 }
15384 }
15385
15386 probe = first;
15387
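	/*
	 * Unhook each of the module's probes from the global hashes and the
	 * probe array, threading them onto a private list (headed by 'first')
	 * so they can be destroyed once dtrace_sync() completes below.
	 */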
15388 for (first = NULL; probe != NULL; probe = next) {
15389 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
15390
15391 dtrace_probes[probe->dtpr_id - 1] = NULL;
15392 probe->dtpr_provider->dtpv_probe_count--;
15393
15394 next = probe->dtpr_nextmod;
15395 dtrace_hash_remove(dtrace_bymod, probe);
15396 dtrace_hash_remove(dtrace_byfunc, probe);
15397 dtrace_hash_remove(dtrace_byname, probe);
15398
15399 if (first == NULL) {
15400 first = probe;
15401 probe->dtpr_nextmod = NULL;
15402 } else {
15403 probe->dtpr_nextmod = first;
15404 first = probe;
15405 }
15406 }
15407
15408 /*
15409 * We've removed all of the module's probes from the hash chains and
15410 * from the probe array. Now issue a dtrace_sync() to be sure that
15411 * everyone has cleared out from any probe array processing.
15412 */
15413 dtrace_sync();
15414
15415 for (probe = first; probe != NULL; probe = first) {
15416 first = probe->dtpr_nextmod;
15417 prov = probe->dtpr_provider;
15418 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
15419 probe->dtpr_arg);
15420 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
15421 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
15422 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
15423 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
15424
15425 zfree(dtrace_probe_t_zone, probe);
15426 }
15427
15428 dtrace_modctl_remove(ctl);
15429
15430 if (syncmode)
15431 goto syncloop;
15432
15433 lck_mtx_unlock(&dtrace_lock);
15434 lck_mtx_unlock(&mod_lock);
15435 lck_mtx_unlock(&dtrace_provider_lock);
15436
15437 return(0);
15438 }
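/*
 * A minimal sketch (hypothetical names, not part of this file) of the
 * unlink / dtrace_sync() / destroy discipline used above: first make the
 * object unreachable from every lookup structure, then wait for all CPUs
 * to drain out of probe context, and only then tear it down.
 */
#if 0	/* illustrative only */
static void
example_retire_object(example_t *obj)
{
	/* 1. Unpublish: after this, no new lookup can find obj. */
	example_hash_remove(&example_hash, obj);

	/* 2. Wait for any in-flight probe-context users to drain. */
	dtrace_sync();

	/* 3. Now it is safe to free the object. */
	kmem_free(obj, sizeof (*obj));
}
#endif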
15439
15440 void
15441 dtrace_suspend(void)
15442 {
15443 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
15444 }
15445
15446 void
15447 dtrace_resume(void)
15448 {
15449 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
15450 }
15451
15452 static int
15453 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
15454 {
15455 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
15456 lck_mtx_lock(&dtrace_lock);
15457
15458 switch (what) {
15459 case CPU_CONFIG: {
15460 dtrace_state_t *state;
15461 dtrace_optval_t *opt, rs, c;
15462
15463 /*
15464 * For now, we only allocate a new buffer for anonymous state.
15465 */
15466 if ((state = dtrace_anon.dta_state) == NULL)
15467 break;
15468
15469 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
15470 break;
15471
15472 opt = state->dts_options;
15473 c = opt[DTRACEOPT_CPU];
15474
15475 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
15476 break;
15477
15478 /*
15479 * Regardless of what the actual policy is, we're going to
15480 * temporarily set our resize policy to be manual. We're
15481 * also going to temporarily set our CPU option to denote
15482 * the newly configured CPU.
15483 */
15484 rs = opt[DTRACEOPT_BUFRESIZE];
15485 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
15486 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
15487
15488 (void) dtrace_state_buffers(state);
15489
15490 opt[DTRACEOPT_BUFRESIZE] = rs;
15491 opt[DTRACEOPT_CPU] = c;
15492
15493 break;
15494 }
15495
15496 case CPU_UNCONFIG:
15497 /*
15498 * We don't free the buffer in the CPU_UNCONFIG case. (The
15499 * buffer will be freed when the consumer exits.)
15500 */
15501 break;
15502
15503 default:
15504 break;
15505 }
15506
15507 lck_mtx_unlock(&dtrace_lock);
15508 return (0);
15509 }
15510
15511 static void
15512 dtrace_cpu_setup_initial(processorid_t cpu)
15513 {
15514 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
15515 }
15516
15517 static void
15518 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
15519 {
15520 if (dtrace_toxranges >= dtrace_toxranges_max) {
15521 int osize, nsize;
15522 dtrace_toxrange_t *range;
15523
15524 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
15525
15526 if (osize == 0) {
15527 ASSERT(dtrace_toxrange == NULL);
15528 ASSERT(dtrace_toxranges_max == 0);
15529 dtrace_toxranges_max = 1;
15530 } else {
15531 dtrace_toxranges_max <<= 1;
15532 }
15533
15534 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
15535 range = kmem_zalloc(nsize, KM_SLEEP);
15536
15537 if (dtrace_toxrange != NULL) {
15538 ASSERT(osize != 0);
15539 bcopy(dtrace_toxrange, range, osize);
15540 kmem_free(dtrace_toxrange, osize);
15541 }
15542
15543 dtrace_toxrange = range;
15544 }
15545
15546 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
15547 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
15548
15549 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
15550 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
15551 dtrace_toxranges++;
15552 }
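/*
 * dtrace_toxrange_add() is handed to dtrace_toxic_ranges() in dtrace_attach()
 * below. A minimal sketch (hypothetical routine and addresses, not part of
 * this file) of how a platform layer might feed ranges into that callback:
 */
#if 0	/* illustrative only */
static void
example_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit))
{
	/* A hypothetical device aperture that DIF loads must never touch. */
	func((uintptr_t)0xffffff8000000000ULL, (uintptr_t)0xffffff8010000000ULL);
}
#endif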
15553
15554 /*
15555 * DTrace Driver Cookbook Functions
15556 */
15557 /*ARGSUSED*/
15558 static int
15559 dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
15560 {
15561 #pragma unused(cmd) /* __APPLE__ */
15562 dtrace_provider_id_t id;
15563 dtrace_state_t *state = NULL;
15564 dtrace_enabling_t *enab;
15565
15566 lck_mtx_lock(&cpu_lock);
15567 lck_mtx_lock(&dtrace_provider_lock);
15568 lck_mtx_lock(&dtrace_lock);
15569
15570 if (ddi_soft_state_init(&dtrace_softstate,
15571 sizeof (dtrace_state_t), 0) != 0) {
15572 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
15573 lck_mtx_unlock(&dtrace_lock);
15574 lck_mtx_unlock(&dtrace_provider_lock);
15575 lck_mtx_unlock(&cpu_lock);
15576 return (DDI_FAILURE);
15577 }
15578
15579 /* Darwin uses the BSD cloning device driver to automagically obtain the minor device number. */
15580
15581 ddi_report_dev(devi);
15582 dtrace_devi = devi;
15583
15584 dtrace_modload = dtrace_module_loaded;
15585 dtrace_modunload = dtrace_module_unloaded;
15586 dtrace_cpu_init = dtrace_cpu_setup_initial;
15587 dtrace_helpers_cleanup = dtrace_helpers_destroy;
15588 dtrace_helpers_fork = dtrace_helpers_duplicate;
15589 dtrace_cpustart_init = dtrace_suspend;
15590 dtrace_cpustart_fini = dtrace_resume;
15591 dtrace_debugger_init = dtrace_suspend;
15592 dtrace_debugger_fini = dtrace_resume;
15593
15594 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
15595
15596 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
15597
15598 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
15599 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
15600 dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
15601 UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
15602 VM_SLEEP | VMC_IDENTIFIER);
15603 dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
15604 1, INT_MAX, 0);
15605
15606 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
15607 sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
15608 NULL, NULL, NULL, NULL, NULL, 0);
15609
15610 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
15611 dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
15612 offsetof(dtrace_probe_t, dtpr_nextmod),
15613 offsetof(dtrace_probe_t, dtpr_prevmod));
15614
15615 dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
15616 offsetof(dtrace_probe_t, dtpr_nextfunc),
15617 offsetof(dtrace_probe_t, dtpr_prevfunc));
15618
15619 dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
15620 offsetof(dtrace_probe_t, dtpr_nextname),
15621 offsetof(dtrace_probe_t, dtpr_prevname));
15622
15623 if (dtrace_retain_max < 1) {
15624 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
15625 "setting to 1", dtrace_retain_max);
15626 dtrace_retain_max = 1;
15627 }
15628
15629 /*
15630 * Now discover our toxic ranges.
15631 */
15632 dtrace_toxic_ranges(dtrace_toxrange_add);
15633
15634 /*
15635 * Before we register ourselves as a provider to our own framework,
15636 * we would like to assert that dtrace_provider is NULL -- but that's
15637 * not true if we were loaded as a dependency of a DTrace provider.
15638 * Once we've registered, we can assert that dtrace_provider is our
15639 * pseudo provider.
15640 */
15641 (void) dtrace_register("dtrace", &dtrace_provider_attr,
15642 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
15643
15644 ASSERT(dtrace_provider != NULL);
15645 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
15646
15647 #if defined (__x86_64__)
15648 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
15649 dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
15650 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
15651 dtrace_provider, NULL, NULL, "END", 0, NULL);
15652 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
15653 dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
15654 #else
15655 #error Unknown Architecture
15656 #endif
15657
15658 dtrace_anon_property();
15659 lck_mtx_unlock(&cpu_lock);
15660
15661 /*
15662 * If DTrace helper tracing is enabled, we need to allocate the
15663 * trace buffer and initialize the values.
15664 */
15665 if (dtrace_helptrace_enabled) {
15666 ASSERT(dtrace_helptrace_buffer == NULL);
15667 dtrace_helptrace_buffer =
15668 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
15669 dtrace_helptrace_next = 0;
15670 }
15671
15672 /*
15673 * If there are already providers, we must ask them to provide their
15674 * probes, and then match any anonymous enabling against them. Note
15675 * that there should be no other retained enablings at this point;
15676 * the only retained enabling should be the anonymous
15677 * enabling.
15678 */
15679 if (dtrace_anon.dta_enabling != NULL) {
15680 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
15681
15682 /*
15683 * APPLE NOTE: if handling anonymous dof, switch symbol modes.
15684 */
15685 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
15686 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
15687 }
15688
15689 dtrace_enabling_provide(NULL);
15690 state = dtrace_anon.dta_state;
15691
15692 /*
15693 * We couldn't hold cpu_lock across the above call to
15694 * dtrace_enabling_provide(), but we must hold it to actually
15695 * enable the probes. We have to drop all of our locks, pick
15696 * up cpu_lock, and regain our locks before matching the
15697 * retained anonymous enabling.
15698 */
15699 lck_mtx_unlock(&dtrace_lock);
15700 lck_mtx_unlock(&dtrace_provider_lock);
15701
15702 lck_mtx_lock(&cpu_lock);
15703 lck_mtx_lock(&dtrace_provider_lock);
15704 lck_mtx_lock(&dtrace_lock);
15705
15706 if ((enab = dtrace_anon.dta_enabling) != NULL)
15707 (void) dtrace_enabling_match(enab, NULL);
15708
15709 lck_mtx_unlock(&cpu_lock);
15710 }
15711
15712 lck_mtx_unlock(&dtrace_lock);
15713 lck_mtx_unlock(&dtrace_provider_lock);
15714
15715 if (state != NULL) {
15716 /*
15717 * If we created any anonymous state, set it going now.
15718 */
15719 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
15720 }
15721
15722 return (DDI_SUCCESS);
15723 }
15724
15725 /*ARGSUSED*/
15726 static int
15727 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
15728 {
15729 #pragma unused(flag, otyp)
15730 dtrace_state_t *state;
15731 uint32_t priv;
15732 uid_t uid;
15733 zoneid_t zoneid;
15734 int rv;
15735
15736 /* APPLE: Darwin puts Helper on its own major device. */
15737
15738 /*
15739 * If no DTRACE_PRIV_* bits are set in the credential, then the
15740 * caller lacks sufficient permission to do anything with DTrace.
15741 */
15742 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
15743 if (priv == DTRACE_PRIV_NONE)
15744 return (EACCES);
15745
15746 /*
15747 * APPLE NOTE: We delay the initialization of fasttrap as late as possible.
15748 * It certainly can't be later than now!
15749 */
15750 fasttrap_init();
15751
15752 /*
15753 * Ask all providers to provide all their probes.
15754 */
15755 lck_mtx_lock(&dtrace_provider_lock);
15756 dtrace_probe_provide(NULL, NULL);
15757 lck_mtx_unlock(&dtrace_provider_lock);
15758
15759 lck_mtx_lock(&cpu_lock);
15760 lck_mtx_lock(&dtrace_lock);
15761 dtrace_opens++;
15762 dtrace_membar_producer();
15763
15764 /*
15765 * If the kernel debugger is active (that is, if the kernel debugger
15766 * modified text in some way), we won't allow the open.
15767 */
15768 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15769 dtrace_opens--;
15770 lck_mtx_unlock(&dtrace_lock);
15771 lck_mtx_unlock(&cpu_lock);
15772 return (EBUSY);
15773 }
15774
15775 rv = dtrace_state_create(devp, cred_p, &state);
15776 lck_mtx_unlock(&cpu_lock);
15777
15778 if (rv != 0 || state == NULL) {
15779 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
15780 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15781 lck_mtx_unlock(&dtrace_lock);
15782 /* propagate EAGAIN or ERESTART */
15783 return (rv);
15784 }
15785
15786 lck_mtx_unlock(&dtrace_lock);
15787
15788 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
15789
15790 /*
15791 * If we are currently lazy, transition states.
15792 *
15793 * Unlike dtrace_close, we do not need to check the
15794 * value of dtrace_opens, as any positive value (and
15795 * we count as 1) means we transition states.
15796 */
15797 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
15798 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
15799
15800 /*
15801 * Iterate all existing processes and load lazy dofs.
15802 */
15803 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
15804 dtrace_lazy_dofs_proc_iterate_doit,
15805 NULL,
15806 dtrace_lazy_dofs_proc_iterate_filter,
15807 NULL);
15808 }
15809
15810 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
15811
15812 /*
15813 * Update kernel symbol state.
15814 *
15815 * We must own the provider and dtrace locks.
15816 *
15817 * NOTE! It may appear that setting this value this late, after
15818 * dtrace_probe_provide, opens a race. However, any kext loaded after the
15819 * call to probe provide and before we set LAZY_OFF will be marked as
15820 * eligible for symbols from userspace. The same dtrace that is currently
15821 * calling dtrace_open() (this call!) will get a list of kexts needing
15822 * symbols and fill them in, thus closing the race window.
15823 *
15824 * We want to set this value only after it is certain it will succeed, as
15825 * this significantly reduces the complexity of error exits.
15826 */
15827 lck_mtx_lock(&dtrace_lock);
15828 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
15829 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
15830 }
15831 lck_mtx_unlock(&dtrace_lock);
15832
15833 return (0);
15834 }
15835
15836 /*ARGSUSED*/
15837 static int
15838 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
15839 {
15840 #pragma unused(flag, otyp, cred_p) /* __APPLE__ */
15841 minor_t minor = getminor(dev);
15842 dtrace_state_t *state;
15843
15844 /* APPLE NOTE: Darwin puts Helper on its own major device. */
15845
15846 state = ddi_get_soft_state(dtrace_softstate, minor);
15847
15848 lck_mtx_lock(&cpu_lock);
15849 lck_mtx_lock(&dtrace_lock);
15850
15851 if (state->dts_anon) {
15852 /*
15853 * There is anonymous state. Destroy that first.
15854 */
15855 ASSERT(dtrace_anon.dta_state == NULL);
15856 dtrace_state_destroy(state->dts_anon);
15857 }
15858
15859 dtrace_state_destroy(state);
15860 ASSERT(dtrace_opens > 0);
15861
15862 /*
15863 * Only relinquish control of the kernel debugger interface when there
15864 * are no consumers and no anonymous enablings.
15865 */
15866 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
15867 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15868
15869 lck_mtx_unlock(&dtrace_lock);
15870 lck_mtx_unlock(&cpu_lock);
15871
15872 /*
15873 * Lock ordering requires the dof mode lock be taken before
15874 * the dtrace_lock.
15875 */
15876 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
15877 lck_mtx_lock(&dtrace_lock);
15878
15879 if (dtrace_opens == 0) {
15880 /*
15881 * If we are currently lazy-off, and this is the last close, transition to
15882 * lazy state.
15883 */
15884 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
15885 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
15886 }
15887
15888 /*
15889 * If we are the last dtrace client, switch back to lazy (from userspace) symbols
15890 */
15891 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
15892 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
15893 }
15894 }
15895
15896 lck_mtx_unlock(&dtrace_lock);
15897 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
15898
15899 /*
15900 * Kext probes may be retained past the end of the kext's lifespan. The
15901 * probes are kept until the last reference to them has been removed.
15902 * Since closing an active dtrace context is likely to drop that last reference,
15903 * let's take a shot at cleaning out the orphaned probes now.
15904 */
15905 dtrace_module_unloaded(NULL);
15906
15907 return (0);
15908 }
15909
15910 /*ARGSUSED*/
15911 static int
15912 dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
15913 {
15914 #pragma unused(rv)
15915 /*
15916 * Safe to check this outside the dof mode lock
15917 */
15918 if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
15919 return KERN_SUCCESS;
15920
15921 switch (cmd) {
15922 case DTRACEHIOC_ADDDOF:
15923 {
15924 dof_helper_t *dhp = NULL;
15925 size_t dof_ioctl_data_size;
15926 dof_ioctl_data_t* multi_dof;
15927 unsigned int i;
15928 int rval = 0;
15929 user_addr_t user_address = *(user_addr_t*)arg;
15930 uint64_t dof_count;
15931 int multi_dof_claimed = 0;
15932 proc_t* p = current_proc();
15933
15934 /*
15935 * Read the number of DOF sections being passed in.
15936 */
15937 if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
15938 &dof_count,
15939 sizeof(dof_count))) {
15940 dtrace_dof_error(NULL, "failed to copyin dofiod_count");
15941 return (EFAULT);
15942 }
15943
15944 /*
15945 * Range check the count.
15946 */
15947 if (dof_count == 0 || dof_count > 1024) {
15948 dtrace_dof_error(NULL, "dofiod_count is not valid");
15949 return (EINVAL);
15950 }
15951
15952 /*
15953 * Allocate a correctly sized structure and copyin the data.
15954 */
15955 dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
15956 if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
15957 return (ENOMEM);
15958
15959 /* NOTE! We can no longer exit this method via return */
15960 if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
15961 dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
15962 rval = EFAULT;
15963 goto cleanup;
15964 }
15965
15966 /*
15967 * Check that the count didn't change between the first copyin and the second.
15968 */
15969 if (multi_dof->dofiod_count != dof_count) {
15970 rval = EINVAL;
15971 goto cleanup;
15972 }
15973
15974 /*
15975 * Try to process lazily first.
15976 */
15977 rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
15978
15979 /*
15980 * If rval is EACCES, we must be non-lazy.
15981 */
15982 if (rval == EACCES) {
15983 rval = 0;
15984 /*
15985 * Process each dof_helper_t
15986 */
15987 i = 0;
15988 do {
15989 dhp = &multi_dof->dofiod_helpers[i];
15990
15991 dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
15992
15993 if (dof != NULL) {
15994 lck_mtx_lock(&dtrace_lock);
15995
15996 /*
15997 * dtrace_helper_slurp() takes responsibility for the dof --
15998 * it may free it now or it may save it and free it later.
15999 */
16000 if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
16001 rval = EINVAL;
16002 }
16003
16004 lck_mtx_unlock(&dtrace_lock);
16005 }
16006 } while (++i < multi_dof->dofiod_count && rval == 0);
16007 }
16008
16009 /*
16010 * We need to copyout the multi_dof struct, because it contains
16011 * the generation (unique id) values needed to call DTRACEHIOC_REMOVE
16012 *
16013 * This could certainly be better optimized.
16014 */
16015 if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
16016 dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
16017 /* Don't overwrite pre-existing error code */
16018 if (rval == 0) rval = EFAULT;
16019 }
16020
16021 cleanup:
16022 /*
16023 * If we had to allocate struct memory, free it.
16024 */
16025 if (multi_dof != NULL && !multi_dof_claimed) {
16026 kmem_free(multi_dof, dof_ioctl_data_size);
16027 }
16028
16029 return rval;
16030 }
16031
16032 case DTRACEHIOC_REMOVE: {
16033 int generation = *(int*)arg;
16034 proc_t* p = current_proc();
16035
16036 /*
16037 * Try lazy first.
16038 */
16039 int rval = dtrace_lazy_dofs_remove(p, generation);
16040
16041 /*
16042 * EACCES means non-lazy
16043 */
16044 if (rval == EACCES) {
16045 lck_mtx_lock(&dtrace_lock);
16046 rval = dtrace_helper_destroygen(p, generation);
16047 lck_mtx_unlock(&dtrace_lock);
16048 }
16049
16050 return (rval);
16051 }
16052
16053 default:
16054 break;
16055 }
16056
16057 return ENOTTY;
16058 }
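/*
 * DTRACEHIOC_ADDDOF above (like DTRACEIOC_MODUUIDSLIST and
 * DTRACEIOC_PROVMODSYMS below) follows a double-fetch discipline: copy in
 * just the count, size the allocation from it, copy in the full structure,
 * then verify that the embedded count still matches the first fetch so a
 * user thread racing with the ioctl cannot induce an out-of-bounds walk.
 * A minimal sketch with hypothetical names (not part of this file):
 */
#if 0	/* illustrative only */
static int
example_copyin_counted(user_addr_t uaddr, example_list_t **listp)
{
	uint64_t count;
	size_t size;
	example_list_t *list;

	/* First fetch: just the count. */
	if (copyin(uaddr + offsetof(example_list_t, el_count),
	    &count, sizeof (count)))
		return (EFAULT);
	if (count == 0 || count > EXAMPLE_MAX_COUNT)
		return (EINVAL);

	size = EXAMPLE_LIST_SIZE(count);
	if ((list = kmem_alloc(size, KM_SLEEP)) == NULL)
		return (ENOMEM);

	/* Second fetch: the whole structure. */
	if (copyin(uaddr, list, size) != 0) {
		kmem_free(list, size);
		return (EFAULT);
	}

	/* Reject if userspace changed the count between the two fetches. */
	if (list->el_count != count) {
		kmem_free(list, size);
		return (EINVAL);
	}

	*listp = list;
	return (0);
}
#endif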
16059
16060 /*ARGSUSED*/
16061 static int
16062 dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
16063 {
16064 #pragma unused(md)
16065 minor_t minor = getminor(dev);
16066 dtrace_state_t *state;
16067 int rval;
16068
16069 /* Darwin puts Helper on its own major device. */
16070
16071 state = ddi_get_soft_state(dtrace_softstate, minor);
16072
16073 if (state->dts_anon) {
16074 ASSERT(dtrace_anon.dta_state == NULL);
16075 state = state->dts_anon;
16076 }
16077
16078 switch (cmd) {
16079 case DTRACEIOC_PROVIDER: {
16080 dtrace_providerdesc_t pvd;
16081 dtrace_provider_t *pvp;
16082
16083 if (copyin(arg, &pvd, sizeof (pvd)) != 0)
16084 return (EFAULT);
16085
16086 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
16087 lck_mtx_lock(&dtrace_provider_lock);
16088
16089 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
16090 if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
16091 break;
16092 }
16093
16094 lck_mtx_unlock(&dtrace_provider_lock);
16095
16096 if (pvp == NULL)
16097 return (ESRCH);
16098
16099 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
16100 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
16101 if (copyout(&pvd, arg, sizeof (pvd)) != 0)
16102 return (EFAULT);
16103
16104 return (0);
16105 }
16106
16107 case DTRACEIOC_EPROBE: {
16108 dtrace_eprobedesc_t epdesc;
16109 dtrace_ecb_t *ecb;
16110 dtrace_action_t *act;
16111 void *buf;
16112 size_t size;
16113 uintptr_t dest;
16114 int nrecs;
16115
16116 if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
16117 return (EFAULT);
16118
16119 lck_mtx_lock(&dtrace_lock);
16120
16121 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
16122 lck_mtx_unlock(&dtrace_lock);
16123 return (EINVAL);
16124 }
16125
16126 if (ecb->dte_probe == NULL) {
16127 lck_mtx_unlock(&dtrace_lock);
16128 return (EINVAL);
16129 }
16130
16131 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
16132 epdesc.dtepd_uarg = ecb->dte_uarg;
16133 epdesc.dtepd_size = ecb->dte_size;
16134
16135 nrecs = epdesc.dtepd_nrecs;
16136 epdesc.dtepd_nrecs = 0;
16137 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
16138 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
16139 continue;
16140
16141 epdesc.dtepd_nrecs++;
16142 }
16143
16144 /*
16145 * Now that we have the size, we need to allocate a temporary
16146 * buffer in which to store the complete description. We need
16147 * the temporary buffer to be able to drop dtrace_lock()
16148 * across the copyout(), below.
16149 */
16150 size = sizeof (dtrace_eprobedesc_t) +
16151 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
16152
16153 buf = kmem_alloc(size, KM_SLEEP);
16154 dest = (uintptr_t)buf;
16155
16156 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
16157 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
16158
16159 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
16160 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
16161 continue;
16162
16163 if (nrecs-- == 0)
16164 break;
16165
16166 bcopy(&act->dta_rec, (void *)dest,
16167 sizeof (dtrace_recdesc_t));
16168 dest += sizeof (dtrace_recdesc_t);
16169 }
16170
16171 lck_mtx_unlock(&dtrace_lock);
16172
16173 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
16174 kmem_free(buf, size);
16175 return (EFAULT);
16176 }
16177
16178 kmem_free(buf, size);
16179 return (0);
16180 }
16181
16182 case DTRACEIOC_AGGDESC: {
16183 dtrace_aggdesc_t aggdesc;
16184 dtrace_action_t *act;
16185 dtrace_aggregation_t *agg;
16186 int nrecs;
16187 uint32_t offs;
16188 dtrace_recdesc_t *lrec;
16189 void *buf;
16190 size_t size;
16191 uintptr_t dest;
16192
16193 if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
16194 return (EFAULT);
16195
16196 lck_mtx_lock(&dtrace_lock);
16197
16198 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
16199 lck_mtx_unlock(&dtrace_lock);
16200 return (EINVAL);
16201 }
16202
16203 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
16204
16205 nrecs = aggdesc.dtagd_nrecs;
16206 aggdesc.dtagd_nrecs = 0;
16207
16208 offs = agg->dtag_base;
16209 lrec = &agg->dtag_action.dta_rec;
16210 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
16211
16212 for (act = agg->dtag_first; ; act = act->dta_next) {
16213 ASSERT(act->dta_intuple ||
16214 DTRACEACT_ISAGG(act->dta_kind));
16215
16216 /*
16217 * If this action has a record size of zero, it
16218 * denotes an argument to the aggregating action.
16219 * Because the presence of this record doesn't (or
16220 * shouldn't) affect the way the data is interpreted,
16221 * we don't copy it out to save user-level the
16222 * confusion of dealing with a zero-length record.
16223 */
16224 if (act->dta_rec.dtrd_size == 0) {
16225 ASSERT(agg->dtag_hasarg);
16226 continue;
16227 }
16228
16229 aggdesc.dtagd_nrecs++;
16230
16231 if (act == &agg->dtag_action)
16232 break;
16233 }
16234
16235 /*
16236 * Now that we have the size, we need to allocate a temporary
16237 * buffer in which to store the complete description. We need
16238 * the temporary buffer to be able to drop dtrace_lock()
16239 * across the copyout(), below.
16240 */
16241 size = sizeof (dtrace_aggdesc_t) +
16242 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
16243
16244 buf = kmem_alloc(size, KM_SLEEP);
16245 dest = (uintptr_t)buf;
16246
16247 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
16248 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
16249
16250 for (act = agg->dtag_first; ; act = act->dta_next) {
16251 dtrace_recdesc_t rec = act->dta_rec;
16252
16253 /*
16254 * See the comment in the above loop for why we pass
16255 * over zero-length records.
16256 */
16257 if (rec.dtrd_size == 0) {
16258 ASSERT(agg->dtag_hasarg);
16259 continue;
16260 }
16261
16262 if (nrecs-- == 0)
16263 break;
16264
16265 rec.dtrd_offset -= offs;
16266 bcopy(&rec, (void *)dest, sizeof (rec));
16267 dest += sizeof (dtrace_recdesc_t);
16268
16269 if (act == &agg->dtag_action)
16270 break;
16271 }
16272
16273 lck_mtx_unlock(&dtrace_lock);
16274
16275 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
16276 kmem_free(buf, size);
16277 return (EFAULT);
16278 }
16279
16280 kmem_free(buf, size);
16281 return (0);
16282 }
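/*
 * Both the EPROBE and AGGDESC cases above use the same pattern to avoid
 * calling copyout() while holding dtrace_lock: the description is built
 * into a temporary kernel buffer under the lock, the lock is dropped, and
 * only then is the buffer copied out and freed. A minimal sketch with
 * hypothetical names (not part of this file):
 */
#if 0	/* illustrative only */
static int
example_copyout_snapshot(user_addr_t uaddr)
{
	void *buf;
	size_t size;

	lck_mtx_lock(&dtrace_lock);
	size = example_describe_size();		/* stable while the lock is held */
	buf = kmem_alloc(size, KM_SLEEP);
	example_describe_fill(buf, size);	/* snapshot state into buf */
	lck_mtx_unlock(&dtrace_lock);

	/* The (possibly faulting) copyout happens with no locks held. */
	if (copyout(buf, uaddr, size) != 0) {
		kmem_free(buf, size);
		return (EFAULT);
	}

	kmem_free(buf, size);
	return (0);
}
#endif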
16283
16284 case DTRACEIOC_ENABLE: {
16285 dof_hdr_t *dof;
16286 dtrace_enabling_t *enab = NULL;
16287 dtrace_vstate_t *vstate;
16288 int err = 0;
16289
16290 *rv = 0;
16291
16292 /*
16293 * If a NULL argument has been passed, we take this as our
16294 * cue to reevaluate our enablings.
16295 */
16296 if (arg == 0) {
16297 dtrace_enabling_matchall();
16298
16299 return (0);
16300 }
16301
16302 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
16303 return (rval);
16304
16305 lck_mtx_lock(&cpu_lock);
16306 lck_mtx_lock(&dtrace_lock);
16307 vstate = &state->dts_vstate;
16308
16309 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
16310 lck_mtx_unlock(&dtrace_lock);
16311 lck_mtx_unlock(&cpu_lock);
16312 dtrace_dof_destroy(dof);
16313 return (EBUSY);
16314 }
16315
16316 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
16317 lck_mtx_unlock(&dtrace_lock);
16318 lck_mtx_unlock(&cpu_lock);
16319 dtrace_dof_destroy(dof);
16320 return (EINVAL);
16321 }
16322
16323 if ((rval = dtrace_dof_options(dof, state)) != 0) {
16324 dtrace_enabling_destroy(enab);
16325 lck_mtx_unlock(&dtrace_lock);
16326 lck_mtx_unlock(&cpu_lock);
16327 dtrace_dof_destroy(dof);
16328 return (rval);
16329 }
16330
16331 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
16332 err = dtrace_enabling_retain(enab);
16333 } else {
16334 dtrace_enabling_destroy(enab);
16335 }
16336
16337 lck_mtx_unlock(&dtrace_lock);
16338 lck_mtx_unlock(&cpu_lock);
16339 dtrace_dof_destroy(dof);
16340
16341 return (err);
16342 }
16343
16344 case DTRACEIOC_REPLICATE: {
16345 dtrace_repldesc_t desc;
16346 dtrace_probedesc_t *match = &desc.dtrpd_match;
16347 dtrace_probedesc_t *create = &desc.dtrpd_create;
16348 int err;
16349
16350 if (copyin(arg, &desc, sizeof (desc)) != 0)
16351 return (EFAULT);
16352
16353 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
16354 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
16355 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
16356 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
16357
16358 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
16359 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
16360 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
16361 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
16362
16363 lck_mtx_lock(&dtrace_lock);
16364 err = dtrace_enabling_replicate(state, match, create);
16365 lck_mtx_unlock(&dtrace_lock);
16366
16367 return (err);
16368 }
16369
16370 case DTRACEIOC_PROBEMATCH:
16371 case DTRACEIOC_PROBES: {
16372 dtrace_probe_t *probe = NULL;
16373 dtrace_probedesc_t desc;
16374 dtrace_probekey_t pkey;
16375 dtrace_id_t i;
16376 int m = 0;
16377 uint32_t priv;
16378 uid_t uid;
16379 zoneid_t zoneid;
16380
16381 if (copyin(arg, &desc, sizeof (desc)) != 0)
16382 return (EFAULT);
16383
16384 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
16385 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
16386 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
16387 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
16388
16389 /*
16390 * Before we attempt to match this probe, we want to give
16391 * all providers the opportunity to provide it.
16392 */
16393 if (desc.dtpd_id == DTRACE_IDNONE) {
16394 lck_mtx_lock(&dtrace_provider_lock);
16395 dtrace_probe_provide(&desc, NULL);
16396 lck_mtx_unlock(&dtrace_provider_lock);
16397 desc.dtpd_id++;
16398 }
16399
16400 if (cmd == DTRACEIOC_PROBEMATCH) {
16401 dtrace_probekey(&desc, &pkey);
16402 pkey.dtpk_id = DTRACE_IDNONE;
16403 }
16404
16405 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
16406
16407 lck_mtx_lock(&dtrace_lock);
16408
16409 if (cmd == DTRACEIOC_PROBEMATCH) {
16410 /* Quiet compiler warning */
16411 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
16412 if ((probe = dtrace_probes[i - 1]) != NULL &&
16413 (m = dtrace_match_probe(probe, &pkey,
16414 priv, uid, zoneid)) != 0)
16415 break;
16416 }
16417
16418 if (m < 0) {
16419 lck_mtx_unlock(&dtrace_lock);
16420 return (EINVAL);
16421 }
16422
16423 } else {
16424 /* Quiet compiler warning */
16425 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
16426 if ((probe = dtrace_probes[i - 1]) != NULL &&
16427 dtrace_match_priv(probe, priv, uid, zoneid))
16428 break;
16429 }
16430 }
16431
16432 if (probe == NULL) {
16433 lck_mtx_unlock(&dtrace_lock);
16434 return (ESRCH);
16435 }
16436
16437 dtrace_probe_description(probe, &desc);
16438 lck_mtx_unlock(&dtrace_lock);
16439
16440 if (copyout(&desc, arg, sizeof (desc)) != 0)
16441 return (EFAULT);
16442
16443 return (0);
16444 }
16445
16446 case DTRACEIOC_PROBEARG: {
16447 dtrace_argdesc_t desc;
16448 dtrace_probe_t *probe;
16449 dtrace_provider_t *prov;
16450
16451 if (copyin(arg, &desc, sizeof (desc)) != 0)
16452 return (EFAULT);
16453
16454 if (desc.dtargd_id == DTRACE_IDNONE)
16455 return (EINVAL);
16456
16457 if (desc.dtargd_ndx == DTRACE_ARGNONE)
16458 return (EINVAL);
16459
16460 lck_mtx_lock(&dtrace_provider_lock);
16461 lck_mtx_lock(&mod_lock);
16462 lck_mtx_lock(&dtrace_lock);
16463
16464 /* Quiet compiler warning */
16465 if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
16466 lck_mtx_unlock(&dtrace_lock);
16467 lck_mtx_unlock(&mod_lock);
16468 lck_mtx_unlock(&dtrace_provider_lock);
16469 return (EINVAL);
16470 }
16471
16472 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
16473 lck_mtx_unlock(&dtrace_lock);
16474 lck_mtx_unlock(&mod_lock);
16475 lck_mtx_unlock(&dtrace_provider_lock);
16476 return (EINVAL);
16477 }
16478
16479 lck_mtx_unlock(&dtrace_lock);
16480
16481 prov = probe->dtpr_provider;
16482
16483 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
16484 /*
16485 * There isn't any typed information for this probe.
16486 * Set the argument number to DTRACE_ARGNONE.
16487 */
16488 desc.dtargd_ndx = DTRACE_ARGNONE;
16489 } else {
16490 desc.dtargd_native[0] = '\0';
16491 desc.dtargd_xlate[0] = '\0';
16492 desc.dtargd_mapping = desc.dtargd_ndx;
16493
16494 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
16495 probe->dtpr_id, probe->dtpr_arg, &desc);
16496 }
16497
16498 lck_mtx_unlock(&mod_lock);
16499 lck_mtx_unlock(&dtrace_provider_lock);
16500
16501 if (copyout(&desc, arg, sizeof (desc)) != 0)
16502 return (EFAULT);
16503
16504 return (0);
16505 }
16506
16507 case DTRACEIOC_GO: {
16508 processorid_t cpuid;
16509 rval = dtrace_state_go(state, &cpuid);
16510
16511 if (rval != 0)
16512 return (rval);
16513
16514 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
16515 return (EFAULT);
16516
16517 return (0);
16518 }
16519
16520 case DTRACEIOC_STOP: {
16521 processorid_t cpuid;
16522
16523 lck_mtx_lock(&dtrace_lock);
16524 rval = dtrace_state_stop(state, &cpuid);
16525 lck_mtx_unlock(&dtrace_lock);
16526
16527 if (rval != 0)
16528 return (rval);
16529
16530 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
16531 return (EFAULT);
16532
16533 return (0);
16534 }
16535
16536 case DTRACEIOC_DOFGET: {
16537 dof_hdr_t hdr, *dof;
16538 uint64_t len;
16539
16540 if (copyin(arg, &hdr, sizeof (hdr)) != 0)
16541 return (EFAULT);
16542
16543 lck_mtx_lock(&dtrace_lock);
16544 dof = dtrace_dof_create(state);
16545 lck_mtx_unlock(&dtrace_lock);
16546
16547 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
16548 rval = copyout(dof, arg, len);
16549 dtrace_dof_destroy(dof);
16550
16551 return (rval == 0 ? 0 : EFAULT);
16552 }
16553
16554 case DTRACEIOC_AGGSNAP:
16555 case DTRACEIOC_BUFSNAP: {
16556 dtrace_bufdesc_t desc;
16557 caddr_t cached;
16558 dtrace_buffer_t *buf;
16559
16560 if (copyin(arg, &desc, sizeof (desc)) != 0)
16561 return (EFAULT);
16562
16563 if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
16564 return (EINVAL);
16565
16566 lck_mtx_lock(&dtrace_lock);
16567
16568 if (cmd == DTRACEIOC_BUFSNAP) {
16569 buf = &state->dts_buffer[desc.dtbd_cpu];
16570 } else {
16571 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
16572 }
16573
16574 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
16575 size_t sz = buf->dtb_offset;
16576
16577 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
16578 lck_mtx_unlock(&dtrace_lock);
16579 return (EBUSY);
16580 }
16581
16582 /*
16583 * If this buffer has already been consumed, we're
16584 * going to indicate that there's nothing left here
16585 * to consume.
16586 */
16587 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
16588 lck_mtx_unlock(&dtrace_lock);
16589
16590 desc.dtbd_size = 0;
16591 desc.dtbd_drops = 0;
16592 desc.dtbd_errors = 0;
16593 desc.dtbd_oldest = 0;
16594 sz = sizeof (desc);
16595
16596 if (copyout(&desc, arg, sz) != 0)
16597 return (EFAULT);
16598
16599 return (0);
16600 }
16601
16602 /*
16603 * If this is a ring buffer that has wrapped, we want
16604 * to copy the whole thing out.
16605 */
16606 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
16607 dtrace_buffer_polish(buf);
16608 sz = buf->dtb_size;
16609 }
16610
16611 if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
16612 lck_mtx_unlock(&dtrace_lock);
16613 return (EFAULT);
16614 }
16615
16616 desc.dtbd_size = sz;
16617 desc.dtbd_drops = buf->dtb_drops;
16618 desc.dtbd_errors = buf->dtb_errors;
16619 desc.dtbd_oldest = buf->dtb_xamot_offset;
16620
16621 lck_mtx_unlock(&dtrace_lock);
16622
16623 if (copyout(&desc, arg, sizeof (desc)) != 0)
16624 return (EFAULT);
16625
16626 buf->dtb_flags |= DTRACEBUF_CONSUMED;
16627
16628 return (0);
16629 }
16630
16631 if (buf->dtb_tomax == NULL) {
16632 ASSERT(buf->dtb_xamot == NULL);
16633 lck_mtx_unlock(&dtrace_lock);
16634 return (ENOENT);
16635 }
16636
16637 cached = buf->dtb_tomax;
16638 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
16639
16640 dtrace_xcall(desc.dtbd_cpu,
16641 (dtrace_xcall_t)dtrace_buffer_switch, buf);
16642
16643 state->dts_errors += buf->dtb_xamot_errors;
16644
16645 /*
16646 * If the buffers did not actually switch, then the cross call
16647 * did not take place -- presumably because the given CPU is
16648 * not in the ready set. If this is the case, we'll return
16649 * ENOENT.
16650 */
16651 if (buf->dtb_tomax == cached) {
16652 ASSERT(buf->dtb_xamot != cached);
16653 lck_mtx_unlock(&dtrace_lock);
16654 return (ENOENT);
16655 }
16656
16657 ASSERT(cached == buf->dtb_xamot);
16658
16659 /*
16660 * We have our snapshot; now copy it out.
16661 */
16662 if (copyout(buf->dtb_xamot, (user_addr_t)desc.dtbd_data,
16663 buf->dtb_xamot_offset) != 0) {
16664 lck_mtx_unlock(&dtrace_lock);
16665 return (EFAULT);
16666 }
16667
16668 desc.dtbd_size = buf->dtb_xamot_offset;
16669 desc.dtbd_drops = buf->dtb_xamot_drops;
16670 desc.dtbd_errors = buf->dtb_xamot_errors;
16671 desc.dtbd_oldest = 0;
16672
16673 lck_mtx_unlock(&dtrace_lock);
16674
16675 /*
16676 * Finally, copy out the buffer description.
16677 */
16678 if (copyout(&desc, arg, sizeof (desc)) != 0)
16679 return (EFAULT);
16680
16681 return (0);
16682 }
16683
16684 case DTRACEIOC_CONF: {
16685 dtrace_conf_t conf;
16686
16687 bzero(&conf, sizeof (conf));
16688 conf.dtc_difversion = DIF_VERSION;
16689 conf.dtc_difintregs = DIF_DIR_NREGS;
16690 conf.dtc_diftupregs = DIF_DTR_NREGS;
16691 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
16692
16693 if (copyout(&conf, arg, sizeof (conf)) != 0)
16694 return (EFAULT);
16695
16696 return (0);
16697 }
16698
16699 case DTRACEIOC_STATUS: {
16700 dtrace_status_t stat;
16701 dtrace_dstate_t *dstate;
16702 int i, j;
16703 uint64_t nerrs;
16704
16705 /*
16706 * See the comment in dtrace_state_deadman() for the reason
16707 * for setting dts_laststatus to INT64_MAX before setting
16708 * it to the correct value.
16709 */
16710 state->dts_laststatus = INT64_MAX;
16711 dtrace_membar_producer();
16712 state->dts_laststatus = dtrace_gethrtime();
16713
16714 bzero(&stat, sizeof (stat));
16715
16716 lck_mtx_lock(&dtrace_lock);
16717
16718 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
16719 lck_mtx_unlock(&dtrace_lock);
16720 return (ENOENT);
16721 }
16722
16723 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
16724 stat.dtst_exiting = 1;
16725
16726 nerrs = state->dts_errors;
16727 dstate = &state->dts_vstate.dtvs_dynvars;
16728
16729 for (i = 0; i < (int)NCPU; i++) {
16730 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
16731
16732 stat.dtst_dyndrops += dcpu->dtdsc_drops;
16733 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
16734 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
16735
16736 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
16737 stat.dtst_filled++;
16738
16739 nerrs += state->dts_buffer[i].dtb_errors;
16740
16741 for (j = 0; j < state->dts_nspeculations; j++) {
16742 dtrace_speculation_t *spec;
16743 dtrace_buffer_t *buf;
16744
16745 spec = &state->dts_speculations[j];
16746 buf = &spec->dtsp_buffer[i];
16747 stat.dtst_specdrops += buf->dtb_xamot_drops;
16748 }
16749 }
16750
16751 stat.dtst_specdrops_busy = state->dts_speculations_busy;
16752 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
16753 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
16754 stat.dtst_dblerrors = state->dts_dblerrors;
16755 stat.dtst_killed =
16756 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
16757 stat.dtst_errors = nerrs;
16758
16759 lck_mtx_unlock(&dtrace_lock);
16760
16761 if (copyout(&stat, arg, sizeof (stat)) != 0)
16762 return (EFAULT);
16763
16764 return (0);
16765 }
16766
16767 case DTRACEIOC_FORMAT: {
16768 dtrace_fmtdesc_t fmt;
16769 char *str;
16770 int len;
16771
16772 if (copyin(arg, &fmt, sizeof (fmt)) != 0)
16773 return (EFAULT);
16774
16775 lck_mtx_lock(&dtrace_lock);
16776
16777 if (fmt.dtfd_format == 0 ||
16778 fmt.dtfd_format > state->dts_nformats) {
16779 lck_mtx_unlock(&dtrace_lock);
16780 return (EINVAL);
16781 }
16782
16783 /*
16784 * Format strings are allocated contiguously and they are
16785 * never freed; if a format index is less than the number
16786 * of formats, we can assert that the format map is non-NULL
16787 * and that the format for the specified index is non-NULL.
16788 */
16789 ASSERT(state->dts_formats != NULL);
16790 str = state->dts_formats[fmt.dtfd_format - 1];
16791 ASSERT(str != NULL);
16792
16793 len = strlen(str) + 1;
16794
16795 if (len > fmt.dtfd_length) {
16796 fmt.dtfd_length = len;
16797
16798 if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
16799 lck_mtx_unlock(&dtrace_lock);
16800 return (EINVAL);
16801 }
16802 } else {
16803 if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
16804 lck_mtx_unlock(&dtrace_lock);
16805 return (EINVAL);
16806 }
16807 }
16808
16809 lck_mtx_unlock(&dtrace_lock);
16810 return (0);
16811 }
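/*
 * The format case above implements a two-call sizing protocol: if the
 * caller's dtfd_length is too small, the kernel writes the required length
 * back into the descriptor and the caller retries with a larger buffer.
 * A minimal userspace-side sketch of that protocol; the descriptor type and
 * example_fmt_ioctl() are hypothetical stand-ins for however the consumer
 * actually marshals DTRACEIOC_FORMAT (not part of this file):
 */
#if 0	/* illustrative only */
typedef struct example_fmtdesc {
	uint16_t	efd_format;	/* format index to fetch */
	uint64_t	efd_length;	/* in: buffer size, out: required size */
	char		*efd_string;	/* buffer receiving the format string */
} example_fmtdesc_t;

extern int example_fmt_ioctl(example_fmtdesc_t *);	/* issues DTRACEIOC_FORMAT */

static char *
example_fetch_format(uint16_t format)
{
	example_fmtdesc_t fmt = { .efd_format = format };

	/* First call: efd_length is 0, so only the needed size comes back. */
	if (example_fmt_ioctl(&fmt) != 0)
		return (NULL);

	if ((fmt.efd_string = malloc(fmt.efd_length)) == NULL)
		return (NULL);

	/* Second call: the buffer is large enough, the string is copied out. */
	if (example_fmt_ioctl(&fmt) != 0) {
		free(fmt.efd_string);
		return (NULL);
	}

	return (fmt.efd_string);
}
#endif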
16812
16813 case DTRACEIOC_MODUUIDSLIST: {
16814 size_t module_uuids_list_size;
16815 dtrace_module_uuids_list_t* uuids_list;
16816 uint64_t dtmul_count;
16817
16818 /*
16819 * Security restrictions make this operation illegal; if they are in effect,
16820 * DTrace must refuse to provide any fbt probes.
16821 */
16822 if (dtrace_is_restricted()) {
16823 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
16824 return (EPERM);
16825 }
16826
16827 /*
16828 * Fail if the kernel symbol mode makes this operation illegal.
16829 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states; it is legal to check
16830 * for them without holding the dtrace_lock.
16831 */
16832 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
16833 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
16834 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
16835 return (EPERM);
16836 }
16837
16838 /*
16839 * Read the number of module UUIDs being passed in.
16840 */
16841 if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
16842 &dtmul_count,
16843 sizeof(dtmul_count))) {
16844 cmn_err(CE_WARN, "failed to copyin dtmul_count");
16845 return (EFAULT);
16846 }
16847
16848 /*
16849 * Range check the count. More than 2k kexts is probably an error.
16850 */
16851 if (dtmul_count > 2048) {
16852 cmn_err(CE_WARN, "dtmul_count is not valid");
16853 return (EINVAL);
16854 }
16855
16856 /*
16857 * For all queries, we return EINVAL when the user-specified
16858 * count does not match the actual number of modules we find
16859 * available.
16860 *
16861 * If the user-specified count is zero, then this serves as a
16862 * simple query to count the available modules in need of symbols.
16863 */
16864
16865 rval = 0;
16866
16867 if (dtmul_count == 0)
16868 {
16869 lck_mtx_lock(&mod_lock);
16870 struct modctl* ctl = dtrace_modctl_list;
16871 while (ctl) {
16872 /* Update the private probes bit */
16873 if (dtrace_provide_private_probes)
16874 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
16875
16876 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
16877 if (!MOD_SYMBOLS_DONE(ctl)) {
16878 dtmul_count++;
16879 rval = EINVAL;
16880 }
16881 ctl = ctl->mod_next;
16882 }
16883 lck_mtx_unlock(&mod_lock);
16884
16885 if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
16886 return (EFAULT);
16887 else
16888 return (rval);
16889 }
16890
16891 /*
16892 * If we reach this point, then we have a request for full list data.
16893 * Allocate a correctly sized structure and copyin the data.
16894 */
16895 module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
16896 if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
16897 return (ENOMEM);
16898
16899 /* NOTE! We can no longer exit this method via return */
16900 if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
16901 cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
16902 rval = EFAULT;
16903 goto moduuidslist_cleanup;
16904 }
16905
16906 /*
16907 * Check that the count didn't change between the first copyin and the second.
16908 */
16909 if (uuids_list->dtmul_count != dtmul_count) {
16910 rval = EINVAL;
16911 goto moduuidslist_cleanup;
16912 }
16913
16914 /*
16915 * Build the list of UUIDs that need symbols.
16916 */
16917 lck_mtx_lock(&mod_lock);
16918
16919 dtmul_count = 0;
16920
16921 struct modctl* ctl = dtrace_modctl_list;
16922 while (ctl) {
16923 /* Update the private probes bit */
16924 if (dtrace_provide_private_probes)
16925 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
16926
16927 /*
16928 * We assume that userspace symbols will be "better" than kernel level symbols,
16929 * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms
16930 * are available, add user syms if the module might use them.
16931 */
16932 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
16933 if (!MOD_SYMBOLS_DONE(ctl)) {
16934 UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
16935 if (dtmul_count++ < uuids_list->dtmul_count) {
16936 memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
16937 }
16938 }
16939 ctl = ctl->mod_next;
16940 }
16941
16942 lck_mtx_unlock(&mod_lock);
16943
16944 if (uuids_list->dtmul_count < dtmul_count)
16945 rval = EINVAL;
16946
16947 uuids_list->dtmul_count = dtmul_count;
16948
16949 /*
16950 * Copyout the symbols list (or at least the count!)
16951 */
16952 if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
16953 cmn_err(CE_WARN, "failed copyout of dtrace_symbolsdesc_list_t");
16954 rval = EFAULT;
16955 }
16956
16957 moduuidslist_cleanup:
16958 /*
16959 * If we had to allocate struct memory, free it.
16960 */
16961 if (uuids_list != NULL) {
16962 kmem_free(uuids_list, module_uuids_list_size);
16963 }
16964
16965 return rval;
16966 }
16967
16968 case DTRACEIOC_PROVMODSYMS: {
16969 size_t module_symbols_size;
16970 dtrace_module_symbols_t* module_symbols;
16971 uint64_t dtmodsyms_count;
16972
16973 /*
16974 * Security restrictions make this operation illegal; if they are in effect,
16975 * DTrace must refuse to provide any fbt probes.
16976 */
16977 if (dtrace_is_restricted()) {
16978 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
16979 return (EPERM);
16980 }
16981
16982 /*
16983 * Fail if the kernel symbol mode makes this operation illegal.
16984 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states; it is legal to check
16985 * for them without holding the dtrace_lock.
16986 */
16987 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
16988 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
16989 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
16990 return (EPERM);
16991 }
16992
16993 /*
16994 * Read the number of module symbols structs being passed in.
16995 */
16996 if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
16997 &dtmodsyms_count,
16998 sizeof(dtmodsyms_count))) {
16999 cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
17000 return (EFAULT);
17001 }
17002
17003 /*
17004 * Range check the count. How much data can we pass around?
17005 * FIX ME!
17006 */
17007 if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) {
17008 cmn_err(CE_WARN, "dtmodsyms_count is not valid");
17009 return (EINVAL);
17010 }
17011
17012 /*
17013 * Allocate a correctly sized structure and copyin the data.
17014 */
17015 module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
17016 if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
17017 return (ENOMEM);
17018
17019 rval = 0;
17020
17021 /* NOTE! We can no longer exit this method via return */
17022 if (copyin(arg, module_symbols, module_symbols_size) != 0) {
17023 cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t, symbol count %llu", module_symbols->dtmodsyms_count);
17024 rval = EFAULT;
17025 goto module_symbols_cleanup;
17026 }
17027
17028 /*
17029 * Check that the count didn't change between the first copyin and the second.
17030 */
17031 if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
17032 rval = EINVAL;
17033 goto module_symbols_cleanup;
17034 }
17035
17036 /*
17037 * Find the modctl to add symbols to.
17038 */
17039 lck_mtx_lock(&dtrace_provider_lock);
17040 lck_mtx_lock(&mod_lock);
17041
17042 struct modctl* ctl = dtrace_modctl_list;
17043 while (ctl) {
17044 /* Update the private probes bit */
17045 if (dtrace_provide_private_probes)
17046 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
17047
17048 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
17049 if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl)) {
17050 if (memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
17051 /* BINGO! */
17052 ctl->mod_user_symbols = module_symbols;
17053 break;
17054 }
17055 }
17056 ctl = ctl->mod_next;
17057 }
17058
17059 if (ctl) {
17060 dtrace_provider_t *prv;
17061
17062 /*
17063 * We're going to call each provider's per-module provide operation,
17064 * specifying only this module.
17065 */
17066 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
17067 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
17068
17069 /*
17070 * We gave every provider a chance to provide with the user syms; go ahead and clear them
17071 */
17072 ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
17073 }
17074
17075 lck_mtx_unlock(&mod_lock);
17076 lck_mtx_unlock(&dtrace_provider_lock);
17077
17078 module_symbols_cleanup:
17079 /*
17080 * If we had to allocate struct memory, free it.
17081 */
17082 if (module_symbols != NULL) {
17083 kmem_free(module_symbols, module_symbols_size);
17084 }
17085
17086 return rval;
17087 }
17088
17089 case DTRACEIOC_PROCWAITFOR: {
17090 dtrace_procdesc_t pdesc = {
17091 .p_comm = {0},
17092 .p_pid = -1
17093 };
17094
17095 if ((rval = copyin(arg, &pdesc, sizeof(pdesc))) != 0)
17096 goto proc_waitfor_error;
17097
17098 if ((rval = dtrace_proc_waitfor(&pdesc)) != 0)
17099 goto proc_waitfor_error;
17100
17101 if ((rval = copyout(&pdesc, arg, sizeof(pdesc))) != 0)
17102 goto proc_waitfor_error;
17103
17104 return 0;
17105
17106 proc_waitfor_error:
17107 /* The process was suspended; revert this since the client will not do it. */
17108 if (pdesc.p_pid != -1) {
17109 proc_t *proc = proc_find(pdesc.p_pid);
17110 if (proc != PROC_NULL) {
17111 task_pidresume(proc->task);
17112 proc_rele(proc);
17113 }
17114 }
17115
17116 return rval;
17117 }
17118
17119 default:
17120 break;
17121 }
17122
17123 return (ENOTTY);
17124 }
17125
17126 /*
17127 * APPLE NOTE: dtrace_detach not implemented
17128 */
17129 #if !defined(__APPLE__)
17130 /*ARGSUSED*/
17131 static int
17132 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
17133 {
17134 dtrace_state_t *state;
17135
17136 switch (cmd) {
17137 case DDI_DETACH:
17138 break;
17139
17140 case DDI_SUSPEND:
17141 return (DDI_SUCCESS);
17142
17143 default:
17144 return (DDI_FAILURE);
17145 }
17146
17147 lck_mtx_lock(&cpu_lock);
17148 lck_mtx_lock(&dtrace_provider_lock);
17149 lck_mtx_lock(&dtrace_lock);
17150
17151 ASSERT(dtrace_opens == 0);
17152
17153 if (dtrace_helpers > 0) {
17154 lck_mtx_unlock(&dtrace_lock);
17155 lck_mtx_unlock(&dtrace_provider_lock);
17156 lck_mtx_unlock(&cpu_lock);
17157 return (DDI_FAILURE);
17158 }
17159
17160 if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
17161 lck_mtx_unlock(&dtrace_lock);
17162 lck_mtx_unlock(&dtrace_provider_lock);
17163 lck_mtx_unlock(&cpu_lock);
17164 return (DDI_FAILURE);
17165 }
17166
17167 dtrace_provider = NULL;
17168
17169 if ((state = dtrace_anon_grab()) != NULL) {
17170 /*
17171 * If there were ECBs on this state, the provider should
17172 * have not been allowed to detach; assert that there is
17173 * none.
17174 */
17175 ASSERT(state->dts_necbs == 0);
17176 dtrace_state_destroy(state);
17177
17178 /*
17179 * If we're being detached with anonymous state, we need to
17180 * indicate to the kernel debugger that DTrace is now inactive.
17181 */
17182 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17183 }
17184
17185 bzero(&dtrace_anon, sizeof (dtrace_anon_t));
17186 unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
17187 dtrace_cpu_init = NULL;
17188 dtrace_helpers_cleanup = NULL;
17189 dtrace_helpers_fork = NULL;
17190 dtrace_cpustart_init = NULL;
17191 dtrace_cpustart_fini = NULL;
17192 dtrace_debugger_init = NULL;
17193 dtrace_debugger_fini = NULL;
17194 dtrace_kreloc_init = NULL;
17195 dtrace_kreloc_fini = NULL;
17196 dtrace_modload = NULL;
17197 dtrace_modunload = NULL;
17198
17199 lck_mtx_unlock(&cpu_lock);
17200
17201 if (dtrace_helptrace_enabled) {
17202 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
17203 dtrace_helptrace_buffer = NULL;
17204 }
17205
17206 kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
17207 dtrace_probes = NULL;
17208 dtrace_nprobes = 0;
17209
17210 dtrace_hash_destroy(dtrace_bymod);
17211 dtrace_hash_destroy(dtrace_byfunc);
17212 dtrace_hash_destroy(dtrace_byname);
17213 dtrace_bymod = NULL;
17214 dtrace_byfunc = NULL;
17215 dtrace_byname = NULL;
17216
17217 kmem_cache_destroy(dtrace_state_cache);
17218 vmem_destroy(dtrace_minor);
17219 vmem_destroy(dtrace_arena);
17220
17221 if (dtrace_toxrange != NULL) {
17222 kmem_free(dtrace_toxrange,
17223 dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
17224 dtrace_toxrange = NULL;
17225 dtrace_toxranges = 0;
17226 dtrace_toxranges_max = 0;
17227 }
17228
17229 ddi_remove_minor_node(dtrace_devi, NULL);
17230 dtrace_devi = NULL;
17231
17232 ddi_soft_state_fini(&dtrace_softstate);
17233
17234 ASSERT(dtrace_vtime_references == 0);
17235 ASSERT(dtrace_opens == 0);
17236 ASSERT(dtrace_retained == NULL);
17237
17238 lck_mtx_unlock(&dtrace_lock);
17239 lck_mtx_unlock(&dtrace_provider_lock);
17240
17241 /*
17242 * We don't destroy the task queue until after we have dropped our
17243 * locks (taskq_destroy() may block on running tasks). To prevent
17244 * attempting to do work after we have effectively detached but before
17245 * the task queue has been destroyed, all tasks dispatched via the
17246 * task queue must check that DTrace is still attached before
17247 * performing any operation.
17248 */
17249 taskq_destroy(dtrace_taskq);
17250 dtrace_taskq = NULL;
17251
17252 return (DDI_SUCCESS);
17253 }
17254 #endif /* __APPLE__ */
17255
17256 d_open_t _dtrace_open, helper_open;
17257 d_close_t _dtrace_close, helper_close;
17258 d_ioctl_t _dtrace_ioctl, helper_ioctl;
17259
17260 int
17261 _dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
17262 {
17263 #pragma unused(p)
17264 dev_t locdev = dev;
17265
17266 return dtrace_open( &locdev, flags, devtype, CRED());
17267 }
17268
17269 int
17270 helper_open(dev_t dev, int flags, int devtype, struct proc *p)
17271 {
17272 #pragma unused(dev,flags,devtype,p)
17273 return 0;
17274 }
17275
17276 int
17277 _dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
17278 {
17279 #pragma unused(p)
17280 return dtrace_close( dev, flags, devtype, CRED());
17281 }
17282
17283 int
17284 helper_close(dev_t dev, int flags, int devtype, struct proc *p)
17285 {
17286 #pragma unused(dev,flags,devtype,p)
17287 return 0;
17288 }
17289
17290 int
17291 _dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
17292 {
17293 #pragma unused(p)
17294 int err, rv = 0;
17295 user_addr_t uaddrp;
17296
17297 if (proc_is64bit(p))
17298 uaddrp = *(user_addr_t *)data;
17299 else
17300 uaddrp = (user_addr_t) *(uint32_t *)data;
17301
17302 err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
17303
17304 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
17305 if (err != 0) {
17306 ASSERT( (err & 0xfffff000) == 0 );
17307 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
17308 } else if (rv != 0) {
17309 ASSERT( (rv & 0xfff00000) == 0 );
17310 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
17311 } else
17312 return 0;
17313 }
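/*
 * Both _dtrace_ioctl() above and helper_ioctl() below fold the Solaris-style
 * (err, rv) pair into the single BSD errno: a driver error lands in the low
 * 12 bits (values below 4096), while a non-zero rv is shifted up by 12 bits
 * (values of 4096 and above). A minimal consumer-side decode sketch
 * (hypothetical helper, not part of this file):
 */
#if 0	/* illustrative only */
static void
example_decode_dtrace_errno(int e, int *errp, int *rvp)
{
	if (e < 4096) {
		*errp = e;		/* genuine error code from the driver */
		*rvp = 0;
	} else {
		*errp = 0;
		*rvp = e >> 12;		/* the ioctl's "rv" out-parameter */
	}
}
#endif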
17314
17315 int
17316 helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
17317 {
17318 #pragma unused(dev,fflag,p)
17319 int err, rv = 0;
17320
17321 err = dtrace_ioctl_helper(cmd, data, &rv);
17322 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
17323 if (err != 0) {
17324 ASSERT( (err & 0xfffff000) == 0 );
17325 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
17326 } else if (rv != 0) {
17327 ASSERT( (rv & 0xfff00000) == 0 );
17328 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
17329 } else
17330 return 0;
17331 }
17332
17333 #define HELPER_MAJOR -24 /* let the kernel pick the device number */
17334
17335 /*
17336  * The character device switch table: the entry points the kernel invokes
17337  * for each operation on the helper device node.
17338  */
17339 static struct cdevsw helper_cdevsw =
17340 {
17341 helper_open, /* open */
17342 helper_close, /* close */
17343 eno_rdwrt, /* read */
17344 eno_rdwrt, /* write */
17345 helper_ioctl, /* ioctl */
17346 (stop_fcn_t *)nulldev, /* stop */
17347 (reset_fcn_t *)nulldev, /* reset */
17348 NULL, /* tty's */
17349 eno_select, /* select */
17350 eno_mmap, /* mmap */
17351 eno_strat, /* strategy */
17352 eno_getc, /* getc */
17353 eno_putc, /* putc */
17354 0 /* type */
17355 };
17356
17357 static int helper_majdevno = 0;
17358
17359 static int gDTraceInited = 0;
17360
17361 void
17362 helper_init( void )
17363 {
17364 /*
17365 * Once the "helper" is initialized, it can take ioctl calls that use locks
17366 * and zones initialized in dtrace_init. Make certain dtrace_init was called
17367 * before us.
17368 */
17369
17370 if (!gDTraceInited) {
17371 panic("helper_init before dtrace_init\n");
17372 }
17373
17374 if (0 >= helper_majdevno)
17375 {
17376 helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
17377
17378 if (helper_majdevno < 0) {
17379 printf("helper_init: failed to allocate a major number!\n");
17380 return;
17381 }
17382
17383 if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
17384 DTRACEMNR_HELPER, 0 )) {
17385 			printf("helper_init: failed to devfs_make_node for helper!\n");
17386 return;
17387 }
17388 } else
17389 panic("helper_init: called twice!\n");
17390 }
17391
17392 #undef HELPER_MAJOR
17393
17394 /*
17395 * Called with DEVFS_LOCK held, so vmem_alloc's underlying blist structures are protected.
17396 */
17397 static int
17398 dtrace_clone_func(dev_t dev, int action)
17399 {
17400 #pragma unused(dev)
17401
17402 if (action == DEVFS_CLONE_ALLOC) {
17403 if (NULL == dtrace_minor) /* Arena not created yet!?! */
17404 return 0;
17405 else {
17406 			/*
17407 			 * Propose a minor number, namely the next vmem_alloc() result, and put it
17408 			 * right back via vmem_free(). FIXME: racy; another caller could claim it first.
17409 			 */
17410 int ret = (int)(uintptr_t)vmem_alloc(dtrace_minor, 1, VM_BESTFIT | VM_SLEEP);
17411
17412 vmem_free(dtrace_minor, (void *)(uintptr_t)ret, 1);
17413
17414 return ret;
17415 }
17416 }
17417 else if (action == DEVFS_CLONE_FREE) {
17418 return 0;
17419 }
17420 else return -1;
17421 }
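/*
 * Illustrative sketch (editorial addition, not xnu code): the effect of the
 * clone hook above as seen from userspace. Every open of /dev/dtrace is
 * assigned its own minor number, so two descriptors behave as two
 * independent DTrace consumers (the function name here is hypothetical).
 */
#include <fcntl.h>
#include <unistd.h>

static int
open_two_consumers(void)
{
	int a = open("/dev/dtrace", O_RDONLY);	/* first clone, first minor */
	int b = open("/dev/dtrace", O_RDONLY);	/* second clone, distinct minor */
	int ok = (a >= 0 && b >= 0);

	if (a >= 0)
		close(a);
	if (b >= 0)
		close(b);
	return ok ? 0 : -1;
}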
17422
17423 #define DTRACE_MAJOR -24 /* let the kernel pick the device number */
17424
17425 static struct cdevsw dtrace_cdevsw =
17426 {
17427 _dtrace_open, /* open */
17428 _dtrace_close, /* close */
17429 eno_rdwrt, /* read */
17430 eno_rdwrt, /* write */
17431 _dtrace_ioctl, /* ioctl */
17432 (stop_fcn_t *)nulldev, /* stop */
17433 (reset_fcn_t *)nulldev, /* reset */
17434 NULL, /* tty's */
17435 eno_select, /* select */
17436 eno_mmap, /* mmap */
17437 eno_strat, /* strategy */
17438 eno_getc, /* getc */
17439 eno_putc, /* putc */
17440 0 /* type */
17441 };
17442
17443 lck_attr_t* dtrace_lck_attr;
17444 lck_grp_attr_t* dtrace_lck_grp_attr;
17445 lck_grp_t* dtrace_lck_grp;
17446
17447 static int gMajDevNo;
17448
17449 void
17450 dtrace_init( void )
17451 {
17452 if (0 == gDTraceInited) {
17453 int i, ncpu;
17454 size_t size = sizeof(dtrace_buffer_memory_maxsize);
17455
17456 /*
17457 * DTrace allocates buffers based on the maximum number
17458 * of enabled cpus. This call avoids any race when finding
17459 * that count.
17460 */
17461 ASSERT(dtrace_max_cpus == 0);
17462 ncpu = dtrace_max_cpus = ml_get_max_cpus();
17463
17464 		/*
17465 		 * Retrieve the size of physical memory in order to define the
17466 		 * maximum amount of memory the state buffers may consume. If the
17467 		 * value cannot be retrieved, assume 1 GB of memory per CPU; that
17468 		 * is still better than panicking the kernel.
17469 		 */
17470 if (0 != kernel_sysctlbyname("hw.memsize", &dtrace_buffer_memory_maxsize,
17471 &size, NULL, 0))
17472 {
17473 			dtrace_buffer_memory_maxsize = ((uint64_t)ncpu) * 1024 * 1024 * 1024; /* widen before multiplying to avoid 32-bit overflow */
17474 			printf("dtrace_init: failed to retrieve hw.memsize, defaulting to %lld bytes\n",
17475 dtrace_buffer_memory_maxsize);
17476 }
17477
17478 /*
17479 * Finally, divide by three to prevent DTrace from eating too
17480 * much memory.
17481 */
17482 dtrace_buffer_memory_maxsize /= 3;
17483 ASSERT(dtrace_buffer_memory_maxsize > 0);
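		/*
		 * Worked example (editorial addition): on a machine with 16 GB of RAM,
		 * hw.memsize reports 17179869184, so dtrace_buffer_memory_maxsize ends
		 * up at roughly 5.3 GB; on the fallback path with 8 CPUs it would be
		 * 8 GB / 3, roughly 2.6 GB.
		 */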
17484
17485 gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
17486
17487 if (gMajDevNo < 0) {
17488 printf("dtrace_init: failed to allocate a major number!\n");
17489 gDTraceInited = 0;
17490 return;
17491 }
17492
17493 if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
17494 dtrace_clone_func, DTRACEMNR_DTRACE, 0 )) {
17495 printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
17496 gDTraceInited = 0;
17497 return;
17498 }
17499
17500 #if defined(DTRACE_MEMORY_ZONES)
17501 /*
17502 * Initialize the dtrace kalloc-emulation zones.
17503 */
17504 dtrace_alloc_init();
17505 #endif /* DTRACE_MEMORY_ZONES */
17506
17507 /*
17508 * Allocate the dtrace_probe_t zone
17509 */
17510 dtrace_probe_t_zone = zinit(sizeof(dtrace_probe_t),
17511 1024 * sizeof(dtrace_probe_t),
17512 sizeof(dtrace_probe_t),
17513 "dtrace.dtrace_probe_t");
17514
17515 /*
17516 * Create the dtrace lock group and attrs.
17517 */
17518 dtrace_lck_attr = lck_attr_alloc_init();
17519 		dtrace_lck_grp_attr = lck_grp_attr_alloc_init();
17520 dtrace_lck_grp = lck_grp_alloc_init("dtrace", dtrace_lck_grp_attr);
17521
17522 /*
17523 * We have to initialize all locks explicitly
17524 */
17525 lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr);
17526 lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr);
17527 lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr);
17528 lck_mtx_init(&dtrace_procwaitfor_lock, dtrace_lck_grp, dtrace_lck_attr);
17529 #if DEBUG
17530 lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr);
17531 #endif
17532 lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr);
17533
17534 /*
17535 * The cpu_core structure consists of per-CPU state available in any context.
17536 * On some architectures, this may mean that the page(s) containing the
17537 * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
17538 * is up to the platform to assure that this is performed properly. Note that
17539 * the structure is sized to avoid false sharing.
17540 */
17541 lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr);
17542 lck_mtx_init(&cyc_lock, dtrace_lck_grp, dtrace_lck_attr);
17543 lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr);
17544
17545 /*
17546 * Initialize the CPU offline/online hooks.
17547 */
17548 dtrace_install_cpu_hooks();
17549
17550 dtrace_modctl_list = NULL;
17551
17552 cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
17553 for (i = 0; i < ncpu; ++i) {
17554 lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr);
17555 }
17556
17557 cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
17558 for (i = 0; i < ncpu; ++i) {
17559 cpu_list[i].cpu_id = (processorid_t)i;
17560 			cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]); /* link the CPUs into a circular list */
17561 LIST_INIT(&cpu_list[i].cpu_cyc_list);
17562 lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr);
17563 }
17564
17565 lck_mtx_lock(&cpu_lock);
17566 for (i = 0; i < ncpu; ++i)
17567 /* FIXME: track CPU configuration a la CHUD Processor Pref Pane. */
17568 dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */
17569 lck_mtx_unlock(&cpu_lock);
17570
17571 (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
17572
17573 dtrace_isa_init();
17574
17575 /*
17576 * See dtrace_impl.h for a description of dof modes.
17577 * The default is lazy dof.
17578 *
17579 * FIXME: Warn if state is LAZY_OFF? It won't break anything, but
17580 * makes no sense...
17581 */
17582 if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
17583 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
17584 }
17585
17586 /*
17587 * Sanity check of dof mode value.
17588 */
17589 switch (dtrace_dof_mode) {
17590 case DTRACE_DOF_MODE_NEVER:
17591 case DTRACE_DOF_MODE_LAZY_ON:
17592 /* valid modes, but nothing else we need to do */
17593 break;
17594
17595 case DTRACE_DOF_MODE_LAZY_OFF:
17596 case DTRACE_DOF_MODE_NON_LAZY:
17597 /* Cannot wait for a dtrace_open to init fasttrap */
17598 fasttrap_init();
17599 break;
17600
17601 default:
17602 			/* Invalid value; clamp to non-lazy */
17603 dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
17604 fasttrap_init();
17605 break;
17606 }
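		/*
		 * Usage note (editorial addition): the mode can be forced from the
		 * boot loader, for example with `sudo nvram boot-args="dtrace_dof_mode=1"`;
		 * the accepted numeric values are the DTRACE_DOF_MODE_* constants
		 * defined in <sys/dtrace_impl.h>, and any other value is clamped to
		 * non-lazy above.
		 */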
17607
17608 /*
17609 * See dtrace_impl.h for a description of kernel symbol modes.
17610 * The default is to wait for symbols from userspace (lazy symbols).
17611 */
17612 if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
17613 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
17614 }
17615
17616 gDTraceInited = 1;
17617
17618 } else
17619 panic("dtrace_init: called twice!\n");
17620 }
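/*
 * Ordering note (editorial addition), derived from the comments in this file:
 * bsd_init() must call dtrace_init() before helper_init(), and
 * dtrace_postinit() only after every provider's *_init() routine has run,
 * since that is where the actual dtrace_attach() happens.
 */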
17621
17622 void
17623 dtrace_postinit(void)
17624 {
17625 /*
17626 	 * Called from bsd_init after all providers' *_init() routines have been
17627 	 * run. That way, anonymous DOF enabled under dtrace_attach() can safely
17628 	 * take effect.
17629 */
17630 dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0), 0 ); /* Punning a dev_t to a dev_info_t* */
17631
17632 /*
17633 * Add the mach_kernel to the module list for lazy processing
17634 */
17635 struct kmod_info fake_kernel_kmod;
17636 memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));
17637
17638 strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
17639 fake_kernel_kmod.id = 1;
17640 fake_kernel_kmod.address = g_kernel_kmod_info.address;
17641 fake_kernel_kmod.size = g_kernel_kmod_info.size;
17642
17643 if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) {
17644 printf("dtrace_postinit: Could not register mach_kernel modctl\n");
17645 }
17646
17647 (void)OSKextRegisterKextsWithDTrace();
17648 }
17649 #undef DTRACE_MAJOR
17650
17651 /*
17652  * Routines used to register interest in CPUs being added to or removed
17653  * from the system. On Darwin these are currently no-ops.
17654 */
17655 void
17656 register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
17657 {
17658 #pragma unused(ignore1,ignore2)
17659 }
17660
17661 void
17662 unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
17663 {
17664 #pragma unused(ignore1,ignore2)
17665 }