bsd/dev/dtrace/dtrace.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Portions Copyright (c) 2013, 2016, Joyent, Inc. All rights reserved.
  24  * Portions Copyright (c) 2013 by Delphix. All rights reserved.
  25  */
  26
  27 /*
  28  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  29  * Use is subject to license terms.
  30  */
  31
  32 /*
  33  * DTrace - Dynamic Tracing for Solaris
  34  *
  35  * This is the implementation of the Solaris Dynamic Tracing framework
  36  * (DTrace).  The user-visible interface to DTrace is described at length in
  37  * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
  38  * library, the in-kernel DTrace framework, and the DTrace providers are
  39  * described in the block comments in the <sys/dtrace.h> header file.  The
  40  * internal architecture of DTrace is described in the block comments in the
  41  * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
  42  * implementation very much assume mastery of all of these sources; if one has
  43  * an unanswered question about the implementation, one should consult them
  44  * first.
  45  *
  46  * The functions here are ordered roughly as follows:
  47  *
  48  *   - Probe context functions
  49  *   - Probe hashing functions
  50  *   - Non-probe context utility functions
  51  *   - Matching functions
  52  *   - Provider-to-Framework API functions
  53  *   - Probe management functions
  54  *   - DIF object functions
  55  *   - Format functions
  56  *   - Predicate functions
  57  *   - ECB functions
  58  *   - Buffer functions
  59  *   - Enabling functions
  60  *   - DOF functions
  61  *   - Anonymous enabling functions
  62  *   - Process functions
  63  *   - Consumer state functions
  64  *   - Helper functions
  65  *   - Hook functions
  66  *   - Driver cookbook functions
  67  *
  68  * Each group of functions begins with a block comment labelled the "DTrace
  69  * [Group] Functions", allowing one to find each block by searching forward
  70  * on capital-f functions.
  71  */
  72 #include <sys/errno.h>
  73 #include <sys/types.h>
  74 #include <sys/stat.h>
  75 #include <sys/conf.h>
  76 #include <sys/random.h>
  77 #include <sys/systm.h>
  78 #include <sys/dtrace_impl.h>
  79 #include <sys/param.h>
  80 #include <sys/proc_internal.h>
  81 #include <sys/ioctl.h>
  82 #include <sys/fcntl.h>
  83 #include <miscfs/devfs/devfs.h>
  84 #include <sys/malloc.h>
  85 #include <sys/kernel_types.h>
  86 #include <sys/proc_internal.h>
  87 #include <sys/uio_internal.h>
  88 #include <sys/kauth.h>
  89 #include <vm/pmap.h>
  90 #include <sys/user.h>
  91 #include <mach/exception_types.h>
  92 #include <sys/signalvar.h>
  93 #include <mach/task.h>
  94 #include <kern/zalloc.h>
  95 #include <kern/ast.h>
  96 #include <kern/sched_prim.h>
  97 #include <kern/task.h>
  98 #include <netinet/in.h>
  99 #include <libkern/sysctl.h>
 100 #include <sys/kdebug.h>
 101
 102 #if MONOTONIC
 103 #include <kern/monotonic.h>
 104 #include <machine/monotonic.h>
 105 #endif /* MONOTONIC */
 106
 107 #include "dtrace_xoroshiro128_plus.h"
 108
 109 #include <IOKit/IOPlatformExpert.h>
 110
 111 #include <kern/cpu_data.h>
 112 extern uint32_t pmap_find_phys(void *, uint64_t);
 113 extern boolean_t pmap_valid_page(uint32_t);
 114 extern void OSKextRegisterKextsWithDTrace(void);
 115 extern kmod_info_t g_kernel_kmod_info;
 116 extern void commpage_update_dof(boolean_t enabled);
 117
 118 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
 119 #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
 120
 121 #define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */
 122
 123 extern void dtrace_suspend(void);
 124 extern void dtrace_resume(void);
 125 extern void dtrace_early_init(void);
 126 extern int dtrace_keep_kernel_symbols(void);
 127 extern void dtrace_init(void);
 128 extern void helper_init(void);
 129 extern void fasttrap_init(void);
 130
 131 static int  dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
 132 extern void dtrace_lazy_dofs_destroy(proc_t *);
 133 extern void dtrace_postinit(void);
 134
 135 extern void dtrace_proc_fork(proc_t*, proc_t*, int);
 136 extern void dtrace_proc_exec(proc_t*);
 137 extern void dtrace_proc_exit(proc_t*);
 138
 139 /*
 140  * DTrace Tunable Variables
 141  *
 142  * The following variables may be dynamically tuned by using sysctl(8), the
 143  * variables being stored in the kern.dtrace namespace.  For example:
 144  *      sysctl kern.dtrace.dof_maxsize = 1048575        # 1M
 145  *
 146  * In general, the only variables that one should be tuning this way are those
 147  * that affect system-wide DTrace behavior, and for which the default behavior
 148  * is undesirable.  Most of these variables are tunable on a per-consumer
 149  * basis using DTrace options, and need not be tuned on a system-wide basis.
 150  * When tuning these variables, avoid pathological values; while some attempt
 151  * is made to verify the integrity of these variables, they are not considered
 152  * part of the supported interface to DTrace, and they are therefore not
 153  * checked comprehensively.
 154  */
  155 uint64_t        dtrace_buffer_memory_maxsize = 0;               /* initialized in dtrace_init */
  156 uint64_t        dtrace_buffer_memory_inuse = 0;
  157 int             dtrace_destructive_disallow = 0;
  158 dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
  159 size_t          dtrace_difo_maxsize = (256 * 1024);
  160 dtrace_optval_t dtrace_dof_maxsize = (512 * 1024);
  161 dtrace_optval_t dtrace_statvar_maxsize = (16 * 1024);
  162 dtrace_optval_t dtrace_statvar_maxsize_max = (16 * 10 * 1024);
  163 size_t          dtrace_actions_max = (16 * 1024);
  164 size_t          dtrace_retain_max = 1024;
  165 dtrace_optval_t dtrace_helper_actions_max = 32;
  166 dtrace_optval_t dtrace_helper_providers_max = 64;
  167 dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
  168 size_t          dtrace_strsize_default = 256;
  169 dtrace_optval_t dtrace_strsize_min = 8;
  170 dtrace_optval_t dtrace_strsize_max = 65536;
  171 dtrace_optval_t dtrace_cleanrate_default = 990099000;           /* ~1.01 hz (values are periods in ns) */
  172 dtrace_optval_t dtrace_cleanrate_min = 20000000;                        /* 50 hz */
  173 dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;  /* 1/minute */
  174 dtrace_optval_t dtrace_aggrate_default = NANOSEC;               /* 1 hz */
  175 dtrace_optval_t dtrace_statusrate_default = NANOSEC;            /* 1 hz */
  176 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;  /* 6/minute */
  177 dtrace_optval_t dtrace_switchrate_default = NANOSEC;            /* 1 hz */
  178 dtrace_optval_t dtrace_nspec_default = 1;
  179 dtrace_optval_t dtrace_specsize_default = 32 * 1024;
  180 dtrace_optval_t dtrace_stackframes_default = 20;
  181 dtrace_optval_t dtrace_ustackframes_default = 20;
  182 dtrace_optval_t dtrace_jstackframes_default = 50;
  183 dtrace_optval_t dtrace_jstackstrsize_default = 512;
  184 dtrace_optval_t dtrace_buflimit_default = 75;
  185 dtrace_optval_t dtrace_buflimit_min = 1;
  186 dtrace_optval_t dtrace_buflimit_max = 99;
  187 size_t          dtrace_nprobes_default = 4;
  188 int             dtrace_msgdsize_max = 128;
  189 hrtime_t        dtrace_chill_max = 500 * (NANOSEC / MILLISEC);  /* 500 ms */
  190 hrtime_t        dtrace_chill_interval = NANOSEC;                /* 1000 ms */
  191 int             dtrace_devdepth_max = 32;
  192 int             dtrace_err_verbose;                             /* 0 or 1; set via kern.dtrace.err_verbose */
  193 hrtime_t        dtrace_deadman_interval = NANOSEC;              /* 1 s */
  194 hrtime_t        dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;        /* 10 s */
  195 hrtime_t        dtrace_deadman_user = (hrtime_t)30 * NANOSEC;   /* 30 s */
 196
 197 /*
 198  * DTrace External Variables
 199  *
 200  * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 201  * available to DTrace consumers via the backtick (`) syntax.  One of these,
 202  * dtrace_zero, is made deliberately so:  it is provided as a source of
 203  * well-known, zero-filled memory.  While this variable is not documented,
 204  * it is used by some translators as an implementation detail.
 205  */
 206 const char      dtrace_zero[256] = { 0 };       /* zero-filled memory */
 207 unsigned int    dtrace_max_cpus = 0;            /* number of enabled cpus */
 208 /*
 209  * DTrace Internal Variables
 210  */
  211 static dev_info_t       *dtrace_devi;           /* device info */
  212 static vmem_t           *dtrace_arena;          /* probe ID arena */
  213 static dtrace_probe_t   **dtrace_probes;        /* array of all probes */
  214 static int              dtrace_nprobes;         /* number of probes */
  215 static dtrace_provider_t *dtrace_provider;      /* provider list */
  216 static dtrace_meta_t    *dtrace_meta_pid;       /* user-land meta provider */
  217 static int              dtrace_opens;           /* number of opens */
  218 static int              dtrace_helpers;         /* number of helpers */
  219 static dtrace_hash_t    *dtrace_strings;        /* NOTE(review): undocumented; presumably a hash of shared strings -- confirm */
  220 static dtrace_hash_t    *dtrace_byprov;         /* probes hashed by provider */
  221 static dtrace_hash_t    *dtrace_bymod;          /* probes hashed by module */
  222 static dtrace_hash_t    *dtrace_byfunc;         /* probes hashed by function */
  223 static dtrace_hash_t    *dtrace_byname;         /* probes hashed by name */
  224 static dtrace_toxrange_t *dtrace_toxrange;      /* toxic range array */
  225 static int              dtrace_toxranges;       /* number of toxic ranges */
  226 static int              dtrace_toxranges_max;   /* size of toxic range array */
  227 static dtrace_anon_t    dtrace_anon;            /* anonymous enabling */
  228 static kmem_cache_t     *dtrace_state_cache;    /* cache for dynamic state */
  229 static uint64_t         dtrace_vtime_references; /* number of vtimestamp refs */
  230 static kthread_t        *dtrace_panicked;       /* panicking thread */
  231 static dtrace_ecb_t     *dtrace_ecb_create_cache; /* cached created ECB */
  232 static dtrace_genid_t   dtrace_probegen;        /* current probe generation */
  233 static dtrace_helpers_t *dtrace_deferred_pid;   /* deferred helper list */
  234 static dtrace_enabling_t *dtrace_retained;      /* list of retained enablings */
  235 static dtrace_genid_t   dtrace_retained_gen;    /* current retained enab gen */
  236 static dtrace_dynvar_t  dtrace_dynhash_sink;    /* end of dynamic hash chains */
  237
  238 static int              dtrace_dof_mode;        /* See dtrace_impl.h for a description of Darwin's dof modes. */
  239
  240                         /*
  241                          * This doesn't quite fit as an internal variable, as it must be accessed in
  242                          * fbt_provide and sdt_provide. It's clearly not a dtrace tunable variable either...
  243                          */
  244 int                     dtrace_kernel_symbol_mode;      /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
  245 static uint32_t         dtrace_wake_clients;
  246 static uint8_t      dtrace_kerneluuid[16];      /* the 128-bit uuid */
 247
 248 /*
 249  * To save memory, some common memory allocations are given a
 250  * unique zone. For example, dtrace_probe_t is 72 bytes in size,
 251  * which means it would fall into the kalloc.128 bucket. With
 252  * 20k elements allocated, the space saved is substantial.
 253  */
 254
 255 struct zone *dtrace_probe_t_zone;
 256
 257 static int dtrace_module_unloaded(struct kmod_info *kmod);
 258
 259 /*
 260  * DTrace Locking
 261  * DTrace is protected by three (relatively coarse-grained) locks:
 262  *
 263  * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 264  *     including enabling state, probes, ECBs, consumer state, helper state,
 265  *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 266  *     probe context is lock-free -- synchronization is handled via the
 267  *     dtrace_sync() cross call mechanism.
 268  *
 269  * (2) dtrace_provider_lock is required when manipulating provider state, or
 270  *     when provider state must be held constant.
 271  *
 272  * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 273  *     when meta provider state must be held constant.
 274  *
 275  * The lock ordering between these three locks is dtrace_meta_lock before
 276  * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 277  * several places where dtrace_provider_lock is held by the framework as it
 278  * calls into the providers -- which then call back into the framework,
 279  * grabbing dtrace_lock.)
 280  *
 281  * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
 282  * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 283  * role as a coarse-grained lock; it is acquired before both of these locks.
 284  * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
 285  * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 286  * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 287  * acquired _between_ dtrace_provider_lock and dtrace_lock.
 288  */
 289
 290
 291 /*
 292  * APPLE NOTE:
 293  *
 294  * For porting purposes, all kmutex_t vars have been changed
 295  * to lck_mtx_t, which require explicit initialization.
 296  *
 297  * kmutex_t becomes lck_mtx_t
 298  * mutex_enter() becomes lck_mtx_lock()
 299  * mutex_exit() becomes lck_mtx_unlock()
 300  *
 301  * Lock asserts are changed like this:
 302  *
 303  * ASSERT(MUTEX_HELD(&cpu_lock));
 304  *      becomes:
 305  * LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
 306  *
 307  */
 308 static lck_mtx_t        dtrace_lock;            /* probe state lock */
 309 static lck_mtx_t        dtrace_provider_lock;   /* provider state lock */
 310 static lck_mtx_t        dtrace_meta_lock;       /* meta-provider state lock */
 311 static lck_rw_t         dtrace_dof_mode_lock;   /* dof mode lock */
 312
 313 /*
 314  * DTrace Provider Variables
 315  *
 316  * These are the variables relating to DTrace as a provider (that is, the
 317  * provider of the BEGIN, END, and ERROR probes).
 318  */
  319 static dtrace_pattr_t   dtrace_provider_attr = {        /* attributes of DTrace's own provider (BEGIN, END, ERROR probes) */
  320 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },      /* NOTE(review): rows presumably map, in order, to provider, */
  321 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },   /* module, function, name and args attributes -- confirm */
  322 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },   /* against the dtrace_pattr_t definition in <sys/dtrace.h> */
  323 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
  324 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
  325 };
 326
  327 static void
  328 dtrace_provide_nullop(void *arg, const dtrace_probedesc_t *desc)
  329 {       /* intentionally empty; installed as .dtps_provide in dtrace_provider_ops */
  330 #pragma unused(arg, desc)
  331 }
 332
  333 static void
  334 dtrace_provide_module_nullop(void *arg, struct modctl *ctl)
  335 {       /* intentionally empty; installed as .dtps_provide_module in dtrace_provider_ops */
  336 #pragma unused(arg, ctl)
  337 }
 338
  339 static int
  340 dtrace_enable_nullop(void *arg, dtrace_id_t id, void *parg)
  341 {       /* does no work; installed as .dtps_enable in dtrace_provider_ops */
  342 #pragma unused(arg, id, parg)
  343     return (0);         /* unconditionally 0, whatever the arguments */
  344 }
 345
  346 static void
  347 dtrace_disable_nullop(void *arg, dtrace_id_t id, void *parg)
  348 {       /* intentionally empty; installed as .dtps_disable in dtrace_provider_ops */
  349 #pragma unused(arg, id, parg)
  350 }
 351
  352 static void
  353 dtrace_suspend_nullop(void *arg, dtrace_id_t id, void *parg)
  354 {       /* intentionally empty; installed as .dtps_suspend in dtrace_provider_ops */
  355 #pragma unused(arg, id, parg)
  356 }
 357
  358 static void
  359 dtrace_resume_nullop(void *arg, dtrace_id_t id, void *parg)
  360 {       /* intentionally empty; installed as .dtps_resume in dtrace_provider_ops */
  361 #pragma unused(arg, id, parg)
  362 }
 363
  364 static void
  365 dtrace_destroy_nullop(void *arg, dtrace_id_t id, void *parg)
  366 {       /* intentionally empty; installed as .dtps_destroy in dtrace_provider_ops */
  367 #pragma unused(arg, id, parg)
  368 }
 369
 370
  371 static dtrace_pops_t dtrace_provider_ops = {    /* ops vector for DTrace's own provider; every non-NULL hook is a no-op stub */
  372         .dtps_provide = dtrace_provide_nullop,
  373         .dtps_provide_module =  dtrace_provide_module_nullop,
  374         .dtps_enable =  dtrace_enable_nullop,
  375         .dtps_disable = dtrace_disable_nullop,
  376         .dtps_suspend = dtrace_suspend_nullop,
  377         .dtps_resume =  dtrace_resume_nullop,
  378         .dtps_getargdesc =      NULL,   /* NOTE(review): NULL hooks presumably select framework defaults -- confirm in <sys/dtrace.h> */
  379         .dtps_getargval =       NULL,
  380         .dtps_usermode =        NULL,
  381         .dtps_destroy = dtrace_destroy_nullop,
  382 };
 383
 384 static dtrace_id_t      dtrace_probeid_begin;   /* special BEGIN probe */
 385 static dtrace_id_t      dtrace_probeid_end;     /* special END probe */
 386 dtrace_id_t             dtrace_probeid_error;   /* special ERROR probe */
 387
 388 /*
 389  * DTrace Helper Tracing Variables
 390  */
 391 uint32_t dtrace_helptrace_next = 0;
 392 uint32_t dtrace_helptrace_nlocals;
 393 char    *dtrace_helptrace_buffer;
 394 size_t  dtrace_helptrace_bufsize = 512 * 1024;
 395
 396 #if DEBUG
 397 int     dtrace_helptrace_enabled = 1;
 398 #else
 399 int     dtrace_helptrace_enabled = 0;
 400 #endif
 401
 402 #if defined (__arm64__)
 403 /*
 404  * The ioctl for adding helper DOF is based on the
 405  * size of a user_addr_t.  We need to recognize both
 406  * U32 and U64 as the same action.
 407  */
 408 #define DTRACEHIOC_ADDDOF_U32       _IOW('h', 4, user32_addr_t)
 409 #define DTRACEHIOC_ADDDOF_U64       _IOW('h', 4, user64_addr_t)
 410 #endif  /* __arm64__ */
 411
 412 /*
 413  * DTrace Error Hashing
 414  *
  415  * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
  416  * table.  This is very useful for checking coverage of tests that are
  417  * expected to induce DIF or DOF processing errors, and may be useful for
  418  * debugging problems in the DIF code generator or in DOF generation.  The
  419  * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 420  */
 421 #if DEBUG
 422 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
 423 static const char *dtrace_errlast;
 424 static kthread_t *dtrace_errthread;
 425 static lck_mtx_t dtrace_errlock;
 426 #endif
 427
 428 /*
 429  * DTrace Macros and Constants
 430  *
 431  * These are various macros that are useful in various spots in the
 432  * implementation, along with a few random constants that have no meaning
 433  * outside of the implementation.  There is no real structure to this cpp
 434  * mishmash -- but is there ever?
 435  */
 436
 437 #define DTRACE_GETSTR(hash, elm)        \
 438         (hash->dth_getstr(elm, hash->dth_stroffs))
 439
 440 #define DTRACE_HASHSTR(hash, elm)       \
 441         dtrace_hash_str(DTRACE_GETSTR(hash, elm))
 442
 443 #define DTRACE_HASHNEXT(hash, elm)      \
 444         (void**)((uintptr_t)(elm) + (hash)->dth_nextoffs)
 445
 446 #define DTRACE_HASHPREV(hash, elm)      \
 447         (void**)((uintptr_t)(elm) + (hash)->dth_prevoffs)
 448
 449 #define DTRACE_HASHEQ(hash, lhs, rhs)   \
 450         (strcmp(DTRACE_GETSTR(hash, lhs), \
 451             DTRACE_GETSTR(hash, rhs)) == 0)
 452
 453 #define DTRACE_AGGHASHSIZE_SLEW         17
 454
 455 #define DTRACE_V4MAPPED_OFFSET          (sizeof (uint32_t) * 3)
 456
 457 /*
 458  * The key for a thread-local variable consists of the lower 61 bits of the
 459  * current_thread(), plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 460  * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 461  * equal to a variable identifier.  This is necessary (but not sufficient) to
 462  * assure that global associative arrays never collide with thread-local
 463  * variables.  To guarantee that they cannot collide, we must also define the
 464  * order for keying dynamic variables.  That order is:
 465  *
 466  *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 467  *
 468  * Because the variable-key and the tls-key are in orthogonal spaces, there is
 469  * no way for a global variable key signature to match a thread-local key
 470  * signature.
 471  */
  472 #if defined (__x86_64__)
  473 /* FIXME: two function calls (ml_at_interrupt_context() and current_thread()) */
  474 #define DTRACE_TLS_THRKEY(where) { \
  475         uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
  476         uint64_t thr = (uintptr_t)current_thread(); \
  477         ASSERT(intr < (1 << 3)); \
  478         (where) = ((thr + DIF_VARIABLE_MAX) & \
  479             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
  480 }
  481 #elif defined(__arm__)
  482 /* FIXME: three function calls (the two above plus dtrace_proc_selfpid(), which is mixed into the low 32 bits) */
  483 #define DTRACE_TLS_THRKEY(where) { \
  484         uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
  485         uint64_t thr = (uintptr_t)current_thread(); \
  486         uint_t pid = (uint_t)dtrace_proc_selfpid(); \
  487         ASSERT(intr < (1 << 3)); \
  488         (where) = (((thr << 32 | pid) + DIF_VARIABLE_MAX) & \
  489             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
  490 }
  491 #elif defined (__arm64__)
  492 /* FIXME: two function calls (ml_at_interrupt_context() and current_thread()); same body as the x86_64 variant */
  493 #define DTRACE_TLS_THRKEY(where) { \
  494         uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
  495         uint64_t thr = (uintptr_t)current_thread(); \
  496         ASSERT(intr < (1 << 3)); \
  497         (where) = ((thr + DIF_VARIABLE_MAX) & \
  498             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
  499 }
  500 #else
  501 #error Unknown architecture
  502 #endif
 503
  504 #define DT_BSWAP_8(x)   ((x) & 0xff)    /* low byte of x */
  505 #define DT_BSWAP_16(x)  ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))   /* swap the two low bytes of x */
  506 #define DT_BSWAP_32(x)  ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))       /* built from two 16-bit swaps */
  507 #define DT_BSWAP_64(x)  ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))       /* x must be a 64-bit type: shifts by 32 */
 508
 509 #define DT_MASK_LO 0x00000000FFFFFFFFULL
 510
 511 #define DTRACE_STORE(type, tomax, offset, what) \
 512         *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
 513
 514
 515 #define DTRACE_ALIGNCHECK(addr, size, flags)                            \
 516         if (addr & (MIN(size,4) - 1)) {                                 \
 517                 *flags |= CPU_DTRACE_BADALIGN;                          \
 518                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
 519                 return (0);                                             \
 520         }
 521
  522 #define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz)       /* bytes from addr to the end of [baseaddr, baseaddr + basesz) */ \
  523 do {                                                                    \
  524         if ((remp) != NULL) {   /* a NULL remp means the caller does not want the remainder */ \
  525                 *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr);    \
  526         }                                                               \
  527 } while (0)
 528
 529
 530 /*
 531  * Test whether a range of memory starting at testaddr of size testsz falls
 532  * within the range of memory described by addr, sz.  We take care to avoid
 533  * problems with overflow and underflow of the unsigned quantities, and
 534  * disallow all negative sizes.  Ranges of size 0 are allowed.
 535  */
 536 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
 537         ((testaddr) - (baseaddr) < (basesz) && \
 538         (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
 539         (testaddr) + (testsz) >= (testaddr))
 540
 541 /*
 542  * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 543  * alloc_sz on the righthand side of the comparison in order to avoid overflow
 544  * or underflow in the comparison with it.  This is simpler than the INRANGE
 545  * check above, because we know that the dtms_scratch_ptr is valid in the
 546  * range.  Allocations of size zero are allowed.
 547  */
 548 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
 549         ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
 550         (mstate)->dtms_scratch_ptr >= (alloc_sz))
 551
 552 #define RECOVER_LABEL(bits) dtraceLoadRecover##bits:
 553
 554 #if defined (__x86_64__) || (defined (__arm__) || defined (__arm64__))
 555 #define DTRACE_LOADFUNC(bits)                                           \
 556 /*CSTYLED*/                                                             \
 557 uint##bits##_t dtrace_load##bits(uintptr_t addr);                       \
 558                                                                         \
 559 uint##bits##_t                                                          \
 560 dtrace_load##bits(uintptr_t addr)                                       \
 561 {                                                                       \
 562         size_t size = bits / NBBY;                                      \
 563         /*CSTYLED*/                                                     \
 564         uint##bits##_t rval = 0;                                        \
 565         int i;                                                          \
 566         volatile uint16_t *flags = (volatile uint16_t *)                \
 567             &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;                   \
 568                                                                         \
 569         DTRACE_ALIGNCHECK(addr, size, flags);                           \
 570                                                                         \
 571         for (i = 0; i < dtrace_toxranges; i++) {                        \
 572                 if (addr >= dtrace_toxrange[i].dtt_limit)               \
 573                         continue;                                       \
 574                                                                         \
 575                 if (addr + size <= dtrace_toxrange[i].dtt_base)         \
 576                         continue;                                       \
 577                                                                         \
 578                 /*                                                      \
 579                  * This address falls within a toxic region; return 0.  \
 580                  */                                                     \
 581                 *flags |= CPU_DTRACE_BADADDR;                           \
 582                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
 583                 return (0);                                             \
 584         }                                                               \
 585                                                                         \
 586         {                                                               \
 587         volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits;          \
 588         *flags |= CPU_DTRACE_NOFAULT;                                   \
 589         recover = dtrace_sign_and_set_thread_recover(current_thread(), recover);        \
 590         /*CSTYLED*/                                                     \
 591         /*                                                              \
 592         * PR6394061 - avoid device memory that is unpredictably         \
 593         * mapped and unmapped                                           \
 594         */                                                              \
 595         if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr)))         \
 596             rval = *((volatile uint##bits##_t *)addr);                  \
 597         else {                                                          \
 598                 *flags |= CPU_DTRACE_BADADDR;                           \
 599                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
 600                 return (0);                                             \
 601         }                                                               \
 602                                                                         \
 603         RECOVER_LABEL(bits);                                            \
 604         (void)dtrace_set_thread_recover(current_thread(), recover);     \
 605         *flags &= ~CPU_DTRACE_NOFAULT;                                  \
 606         }                                                               \
 607                                                                         \
 608         return (rval);                                                  \
 609 }
 610 #else /* all other architectures */
 611 #error Unknown Architecture
 612 #endif
 613
 614 #ifdef __LP64__
 615 #define dtrace_loadptr  dtrace_load64
 616 #else
 617 #define dtrace_loadptr  dtrace_load32
 618 #endif
 619
 620 #define DTRACE_DYNHASH_FREE     0
 621 #define DTRACE_DYNHASH_SINK     1
 622 #define DTRACE_DYNHASH_VALID    2
 623
 624 #define DTRACE_MATCH_FAIL       -1
 625 #define DTRACE_MATCH_NEXT       0
 626 #define DTRACE_MATCH_DONE       1
 627 #define DTRACE_ANCHORED(probe)  ((probe)->dtpr_func[0] != '\0')
 628 #define DTRACE_STATE_ALIGN      64
 629
  630 #define DTRACE_FLAGS2FLT(flags) /* map CPU_DTRACE_* flag bits to one DTRACEFLT_* code; first match below wins */ \
  631         (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :           \
  632         ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :                \
  633         ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :            \
  634         ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :                \
  635         ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :                \
  636         ((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :         \
  637         ((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :         \
  638         ((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :       \
  639         ((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :         \
  640         DTRACEFLT_UNKNOWN)      /* none of the flags tested above is set */
 641
 642 #define DTRACEACT_ISSTRING(act)                                         \
 643         ((act)->dta_kind == DTRACEACT_DIFEXPR &&                        \
 644         (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
 645
 646
 647 static size_t dtrace_strlen(const char *, size_t);
 648 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
 649 static void dtrace_enabling_provide(dtrace_provider_t *);
 650 static int dtrace_enabling_match(dtrace_enabling_t *, int *, dtrace_match_cond_t *cond);
 651 static void dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond);
 652 static void dtrace_enabling_matchall(void);
 653 static dtrace_state_t *dtrace_anon_grab(void);
 654 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
 655     dtrace_state_t *, uint64_t, uint64_t);
 656 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
 657 static void dtrace_buffer_drop(dtrace_buffer_t *);
 658 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
 659     dtrace_state_t *, dtrace_mstate_t *);
 660 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
 661     dtrace_optval_t);
 662 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *, void *);
 663 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
 664 static int dtrace_canload_remains(uint64_t, size_t, size_t *,
 665         dtrace_mstate_t *, dtrace_vstate_t *);
 666 static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
 667         dtrace_mstate_t *, dtrace_vstate_t *);
 668
 669
/*
 * DTrace sysctl handlers
 *
 * These declarations and functions expose deeper DTrace configuration knobs.
 * Most of them are not per-consumer settings and may therefore impact other
 * DTrace consumers.  Correctness is not guaranteed for every possible value
 * of these variables, so be careful about the values you set.
 */

SYSCTL_DECL(_kern_dtrace);
SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "dtrace");
 681
 682 static int
 683 sysctl_dtrace_err_verbose SYSCTL_HANDLER_ARGS
 684 {
 685 #pragma unused(oidp, arg2)
 686         int changed, error;
 687         int value = *(int *) arg1;
 688
 689         error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
 690         if (error || !changed)
 691                 return (error);
 692
 693         if (value != 0 && value != 1)
 694                 return (ERANGE);
 695
 696         lck_mtx_lock(&dtrace_lock);
 697                 dtrace_err_verbose = value;
 698         lck_mtx_unlock(&dtrace_lock);
 699
 700         return (0);
 701 }
 702
/*
 * kern.dtrace.err_verbose
 *
 * Set DTrace verbosity when an error occurred (0 = disabled, 1 = enabled).
 * Errors are reported when a DIFO or a DOF has been rejected by the kernel.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, err_verbose,
        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
        &dtrace_err_verbose, 0,
        sysctl_dtrace_err_verbose, "I", "dtrace error verbose");
 713
 714 static int
 715 sysctl_dtrace_buffer_memory_maxsize SYSCTL_HANDLER_ARGS
 716 {
 717 #pragma unused(oidp, arg2, req)
 718         int changed, error;
 719         uint64_t value = *(uint64_t *) arg1;
 720
 721         error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
 722         if (error || !changed)
 723                 return (error);
 724
 725         if (value <= dtrace_buffer_memory_inuse)
 726                 return (ERANGE);
 727
 728         lck_mtx_lock(&dtrace_lock);
 729                 dtrace_buffer_memory_maxsize = value;
 730         lck_mtx_unlock(&dtrace_lock);
 731
 732         return (0);
 733 }
 734
/*
 * kern.dtrace.buffer_memory_maxsize
 *
 * Set the maximal size, in bytes, used by all the consumers' state buffers.
 * By default the limit is PHYS_MEM / 3 for *all* consumers.  Attempting to set
 * a null value, a negative value or a value <= dtrace_buffer_memory_inuse will
 * result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, buffer_memory_maxsize,
        CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
        &dtrace_buffer_memory_maxsize, 0,
        sysctl_dtrace_buffer_memory_maxsize, "Q", "dtrace state buffer memory maxsize");

/*
 * kern.dtrace.buffer_memory_inuse
 *
 * State buffer memory currently in use, in bytes, by all the DTrace consumers.
 * This value is read-only.
 */
SYSCTL_QUAD(_kern_dtrace, OID_AUTO, buffer_memory_inuse, CTLFLAG_RD | CTLFLAG_LOCKED,
        &dtrace_buffer_memory_inuse, "dtrace state buffer memory in-use");
 755
 756 static int
 757 sysctl_dtrace_difo_maxsize SYSCTL_HANDLER_ARGS
 758 {
 759 #pragma unused(oidp, arg2, req)
 760         int changed, error;
 761         size_t value = *(size_t*) arg1;
 762
 763         error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
 764         if (error || !changed)
 765                 return (error);
 766
 767         if (value <= 0)
 768                 return (ERANGE);
 769
 770         lck_mtx_lock(&dtrace_lock);
 771                 dtrace_difo_maxsize = value;
 772         lck_mtx_unlock(&dtrace_lock);
 773
 774         return (0);
 775 }
 776
/*
 * kern.dtrace.difo_maxsize
 *
 * Set the DIFO maximum size in bytes; check the definition of
 * dtrace_difo_maxsize to get the default value.  Attempting to set a null or
 * negative size will result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, difo_maxsize,
        CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
        &dtrace_difo_maxsize, 0,
        sysctl_dtrace_difo_maxsize, "Q", "dtrace difo maxsize");
 788
 789 static int
 790 sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
 791 {
 792 #pragma unused(oidp, arg2, req)
 793         int changed, error;
 794         dtrace_optval_t value = *(dtrace_optval_t *) arg1;
 795
 796         error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
 797         if (error || !changed)
 798                 return (error);
 799
 800         if (value <= 0)
 801                 return (ERANGE);
 802
 803         if (value >= dtrace_copy_maxsize())
 804                 return (ERANGE);
 805
 806         lck_mtx_lock(&dtrace_lock);
 807                 dtrace_dof_maxsize = value;
 808         lck_mtx_unlock(&dtrace_lock);
 809
 810         return (0);
 811 }
 812
/*
 * kern.dtrace.dof_maxsize
 *
 * Set the DOF maximum size in bytes; check the definition of
 * dtrace_dof_maxsize to get the default value.  Attempting to set a null or
 * negative size, or a size >= dtrace_copy_maxsize(), will result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, dof_maxsize,
        CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
        &dtrace_dof_maxsize, 0,
        sysctl_dtrace_dof_maxsize, "Q", "dtrace dof maxsize");
 824
 825 static int
 826 sysctl_dtrace_statvar_maxsize SYSCTL_HANDLER_ARGS
 827 {
 828 #pragma unused(oidp, arg2, req)
 829         int changed, error;
 830         dtrace_optval_t value = *(dtrace_optval_t*) arg1;
 831
 832         error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
 833         if (error || !changed)
 834                 return (error);
 835
 836         if (value <= 0)
 837                 return (ERANGE);
 838         if (value > dtrace_statvar_maxsize_max)
 839                 return (ERANGE);
 840
 841         lck_mtx_lock(&dtrace_lock);
 842                 dtrace_statvar_maxsize = value;
 843         lck_mtx_unlock(&dtrace_lock);
 844
 845         return (0);
 846 }
 847
/*
 * kern.dtrace.global_maxsize
 *
 * Set the variable maximum size in bytes; check the definition of
 * dtrace_statvar_maxsize to get the default value.  Attempting to set a null,
 * negative or too large size will result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
        CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
        &dtrace_statvar_maxsize, 0,
        sysctl_dtrace_statvar_maxsize, "Q", "dtrace statvar maxsize");
 859
 860
/*
 * kern.dtrace.provide_private_probes
 *
 * Reports whether the providers must provide the private probes.  This is
 * kept for compatibility only, as the private probes are always provided.
 * This value is read-only.
 */
SYSCTL_INT(_kern_dtrace, OID_AUTO, provide_private_probes,
        CTLFLAG_RD | CTLFLAG_LOCKED,
        (int *)NULL, 1, "provider must provide the private probes");

/*
 * kern.dtrace.dof_mode
 *
 * Returns the current DOF mode.
 * This value is read-only.
 */
SYSCTL_INT(_kern_dtrace, OID_AUTO, dof_mode, CTLFLAG_RD | CTLFLAG_LOCKED,
        &dtrace_dof_mode, 0, "dtrace dof mode");
 879
 880 /*
 881  * DTrace Probe Context Functions
 882  *
 883  * These functions are called from probe context.  Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 885  * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 886  * As a result, functions called from probe context may only call other DTrace
 887  * support functions -- they may not interact at all with the system at large.
 888  * (Note that the ASSERT macro is made probe-context safe by redefining it in
 889  * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 890  * loads are to be performed from probe context, they _must_ be in terms of
 891  * the safe dtrace_load*() variants.
 892  *
 893  * Some functions in this block are not actually called from probe context;
 894  * for these functions, there will be a comment above the function reading
 895  * "Note:  not called from probe context."
 896  */
 897
 898 int
 899 dtrace_assfail(const char *a, const char *f, int l)
 900 {
 901         panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);
 902
 903         /*
 904          * We just need something here that even the most clever compiler
 905          * cannot optimize away.
 906          */
 907         return (a[(uintptr_t)f]);
 908 }
 909
 910 /*
 911  * Atomically increment a specified error counter from probe context.
 912  */
 913 static void
 914 dtrace_error(uint32_t *counter)
 915 {
 916         /*
 917          * Most counters stored to in probe context are per-CPU counters.
 918          * However, there are some error conditions that are sufficiently
 919          * arcane that they don't merit per-CPU storage.  If these counters
 920          * are incremented concurrently on different CPUs, scalability will be
 921          * adversely affected -- but we don't expect them to be white-hot in a
 922          * correctly constructed enabling...
 923          */
 924         uint32_t oval, nval;
 925
 926         do {
 927                 oval = *counter;
 928
 929                 if ((nval = oval + 1) == 0) {
 930                         /*
 931                          * If the counter would wrap, set it to 1 -- assuring
 932                          * that the counter is never zero when we have seen
 933                          * errors.  (The counter must be 32-bits because we
 934                          * aren't guaranteed a 64-bit compare&swap operation.)
 935                          * To save this code both the infamy of being fingered
 936                          * by a priggish news story and the indignity of being
 937                          * the target of a neo-puritan witch trial, we're
 938                          * carefully avoiding any colorful description of the
 939                          * likelihood of this condition -- but suffice it to
 940                          * say that it is only slightly more likely than the
 941                          * overflow of predicate cache IDs, as discussed in
 942                          * dtrace_predicate_create().
 943                          */
 944                         nval = 1;
 945                 }
 946         } while (dtrace_cas32(counter, oval, nval) != oval);
 947 }
 948
/*
 * Use the DTRACE_LOADFUNC macro to define one load function for each of the
 * supported widths: a uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)
 957
 958 static int
 959 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
 960 {
 961         if (dest < mstate->dtms_scratch_base)
 962                 return (0);
 963
 964         if (dest + size < dest)
 965                 return (0);
 966
 967         if (dest + size > mstate->dtms_scratch_ptr)
 968                 return (0);
 969
 970         return (1);
 971 }
 972
 973 static int
 974 dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
 975     dtrace_statvar_t **svars, int nsvars)
 976 {
 977         int i;
 978
 979         size_t maxglobalsize, maxlocalsize;
 980
 981         maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
 982         maxlocalsize = (maxglobalsize) * NCPU;
 983
 984         if (nsvars == 0)
 985                 return (0);
 986
 987         for (i = 0; i < nsvars; i++) {
 988                 dtrace_statvar_t *svar = svars[i];
 989                 uint8_t scope;
 990                 size_t size;
 991
 992                 if (svar == NULL || (size = svar->dtsv_size) == 0)
 993                         continue;
 994
 995                 scope = svar->dtsv_var.dtdv_scope;
 996
 997                 /**
 998                  * We verify that our size is valid in the spirit of providing
 999                  * defense in depth:  we want to prevent attackers from using
1000                  * DTrace to escalate an orthogonal kernel heap corruption bug
1001                  * into the ability to store to arbitrary locations in memory.
1002                  */
1003                 VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
1004                         (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
1005
1006                 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) {
1007                         DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
1008                                 svar->dtsv_size);
1009                         return (1);
1010                 }
1011         }
1012
1013         return (0);
1014 }
1015
1016 /*
1017  * Check to see if the address is within a memory region to which a store may
1018  * be issued.  This includes the DTrace scratch areas, and any DTrace variable
1019  * region.  The caller of dtrace_canstore() is responsible for performing any
1020  * alignment checks that are needed before stores are actually executed.
1021  */
1022 static int
1023 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
1024     dtrace_vstate_t *vstate)
1025 {
1026         return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
1027 }
1028 /*
1029  * Implementation of dtrace_canstore which communicates the upper bound of the
1030  * allowed memory region.
1031  */
1032 static int
1033 dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
1034         dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1035 {
1036         /*
1037          * First, check to see if the address is in scratch space...
1038          */
1039         if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
1040             mstate->dtms_scratch_size)) {
1041                 DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
1042                         mstate->dtms_scratch_size);
1043                 return (1);
1044         }
1045         /*
1046          * Now check to see if it's a dynamic variable.  This check will pick
1047          * up both thread-local variables and any global dynamically-allocated
1048          * variables.
1049          */
1050         if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
1051             vstate->dtvs_dynvars.dtds_size)) {
1052                 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
1053                 uintptr_t base = (uintptr_t)dstate->dtds_base +
1054                     (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
1055                 uintptr_t chunkoffs;
1056                 dtrace_dynvar_t *dvar;
1057
1058                 /*
1059                  * Before we assume that we can store here, we need to make
1060                  * sure that it isn't in our metadata -- storing to our
1061                  * dynamic variable metadata would corrupt our state.  For
1062                  * the range to not include any dynamic variable metadata,
1063                  * it must:
1064                  *
1065                  *      (1) Start above the hash table that is at the base of
1066                  *      the dynamic variable space
1067                  *
1068                  *      (2) Have a starting chunk offset that is beyond the
1069                  *      dtrace_dynvar_t that is at the base of every chunk
1070                  *
1071                  *      (3) Not span a chunk boundary
1072                  *
1073                  *      (4) Not be in the tuple space of a dynamic variable
1074                  *
1075                  */
1076                 if (addr < base)
1077                         return (0);
1078
1079                 chunkoffs = (addr - base) % dstate->dtds_chunksize;
1080
1081                 if (chunkoffs < sizeof (dtrace_dynvar_t))
1082                         return (0);
1083
1084                 if (chunkoffs + sz > dstate->dtds_chunksize)
1085                         return (0);
1086
1087                 dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
1088
1089                 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
1090                         return (0);
1091
1092                 if (chunkoffs < sizeof (dtrace_dynvar_t) +
1093                         ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
1094                         return (0);
1095
1096                 return (1);
1097         }
1098
1099         /*
1100          * Finally, check the static local and global variables.  These checks
1101          * take the longest, so we perform them last.
1102          */
1103         if (dtrace_canstore_statvar(addr, sz, remain,
1104             vstate->dtvs_locals, vstate->dtvs_nlocals))
1105                 return (1);
1106
1107         if (dtrace_canstore_statvar(addr, sz, remain,
1108             vstate->dtvs_globals, vstate->dtvs_nglobals))
1109                 return (1);
1110
1111         return (0);
1112 }
1113
1114
1115 /*
1116  * Convenience routine to check to see if the address is within a memory
1117  * region in which a load may be issued given the user's privilege level;
1118  * if not, it sets the appropriate error flags and loads 'addr' into the
1119  * illegal value slot.
1120  *
1121  * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
1122  * appropriate memory access protection.
1123  */
1124 int
1125 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
1126     dtrace_vstate_t *vstate)
1127 {
1128         return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
1129 }
1130
1131 /*
1132  * Implementation of dtrace_canload which communicates the upper bound of the
1133  * allowed memory region.
1134  */
1135 static int
1136 dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
1137         dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1138 {
1139         volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
1140
1141         /*
1142          * If we hold the privilege to read from kernel memory, then
1143          * everything is readable.
1144          */
1145         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1146                 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1147                 return (1);
1148         }
1149
1150         /*
1151          * You can obviously read that which you can store.
1152          */
1153         if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
1154                 return (1);
1155
1156         /*
1157          * We're allowed to read from our own string table.
1158          */
1159         if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
1160             mstate->dtms_difo->dtdo_strlen)) {
1161                 DTRACE_RANGE_REMAIN(remain, addr,
1162                         mstate->dtms_difo->dtdo_strtab,
1163                         mstate->dtms_difo->dtdo_strlen);
1164                 return (1);
1165         }
1166
1167         DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
1168         *illval = addr;
1169         return (0);
1170 }
1171
1172 /*
1173  * Convenience routine to check to see if a given string is within a memory
1174  * region in which a load may be issued given the user's privilege level;
1175  * this exists so that we don't need to issue unnecessary dtrace_strlen()
1176  * calls in the event that the user has all privileges.
1177  */
1178 static int
1179 dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
1180         dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1181 {
1182         size_t rsize;
1183
1184         /*
1185          * If we hold the privilege to read from kernel memory, then
1186          * everything is readable.
1187          */
1188         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1189                 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1190                 return (1);
1191         }
1192
1193         /*
1194          * Even if the caller is uninterested in querying the remaining valid
1195          * range, it is required to ensure that the access is allowed.
1196          */
1197         if (remain == NULL) {
1198                 remain = &rsize;
1199         }
1200         if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
1201                 size_t strsz;
1202                 /*
1203                  * Perform the strlen after determining the length of the
1204                  * memory region which is accessible.  This prevents timing
1205                  * information from being used to find NULs in memory which is
1206                  * not accessible to the caller.
1207                  */
1208                 strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
1209                         MIN(sz, *remain));
1210                 if (strsz <= *remain) {
1211                         return (1);
1212                 }
1213         }
1214
1215         return (0);
1216 }
1217
1218 /*
1219  * Convenience routine to check to see if a given variable is within a memory
1220  * region in which a load may be issued given the user's privilege level.
1221  */
1222 static int
1223 dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
1224         dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1225 {
1226         size_t sz;
1227         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1228
1229         /*
1230          * Calculate the max size before performing any checks since even
1231          * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
1232          * return the max length via 'remain'.
1233          */
1234         if (type->dtdt_kind == DIF_TYPE_STRING) {
1235                 dtrace_state_t *state = vstate->dtvs_state;
1236
1237                 if (state != NULL) {
1238                         sz = state->dts_options[DTRACEOPT_STRSIZE];
1239                 } else {
1240                         /*
1241                          * In helper context, we have a NULL state; fall back
1242                          * to using the system-wide default for the string size
1243                          * in this case.
1244                          */
1245                         sz = dtrace_strsize_default;
1246                 }
1247         } else {
1248                 sz = type->dtdt_size;
1249         }
1250
1251         /*
1252          * If we hold the privilege to read from kernel memory, then
1253          * everything is readable.
1254          */
1255         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1256                 DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
1257                 return (1);
1258         }
1259
1260         if (type->dtdt_kind == DIF_TYPE_STRING) {
1261                 return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
1262                         vstate));
1263         }
1264         return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
1265                 vstate));
1266 }
1267
/*
 * Minimal ASCII character classification helpers.  These are macros, so their
 * argument may be evaluated more than once.  Note that isspace() does not
 * treat '\v' as whitespace.
 */
#define isdigit(ch)     (((ch) >= '0') && ((ch) <= '9'))
#define islower(ch)     (((ch) >= 'a') && ((ch) <= 'z'))
#define isspace(ch)     \
        (((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') ||           \
        ((ch) == '\t') || ((ch) == '\f'))
#define isxdigit(ch)    \
        (isdigit(ch) || (((ch) >= 'a') && ((ch) <= 'f')) ||             \
        (((ch) >= 'A') && ((ch) <= 'F')))
#define lisalnum(x)     \
        (isdigit(x) || islower(x) || (((x) >= 'A') && ((x) <= 'Z')))

/*
 * Numeric value of an alphanumeric character: '0'-'9' map to 0-9, and both
 * 'a'-'z' and 'A'-'Z' map to 10-35.
 */
#define DIGIT(x)        \
        (isdigit(x) ? ((x) - '0') :                                     \
        (islower(x) ? ((x) + 10 - 'a') : ((x) + 10 - 'A')))
1279
1280 /*
1281  * Convert a string to a signed integer using safe loads.
1282  */
1283 static int64_t
1284 dtrace_strtoll(char *input, int base, size_t limit)
1285 {
1286         uintptr_t pos = (uintptr_t)input;
1287         int64_t val = 0;
1288         int x;
1289         boolean_t neg = B_FALSE;
1290         char c, cc, ccc;
1291         uintptr_t end = pos + limit;
1292
1293         /*
1294          * Consume any whitespace preceding digits.
1295          */
1296         while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
1297                 pos++;
1298
1299         /*
1300          * Handle an explicit sign if one is present.
1301          */
1302         if (c == '-' || c == '+') {
1303                 if (c == '-')
1304                         neg = B_TRUE;
1305                 c = dtrace_load8(++pos);
1306         }
1307
1308         /*
1309          * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
1310          * if present.
1311          */
1312         if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
1313             cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
1314                 pos += 2;
1315                 c = ccc;
1316         }
1317
1318         /*
1319          * Read in contiguous digits until the first non-digit character.
1320          */
1321         for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
1322             c = dtrace_load8(++pos))
1323                 val = val * base + x;
1324
1325         return (neg ? -val : val);
1326 }
1327
1328
1329 /*
1330  * Compare two strings using safe loads.
1331  */
1332 static int
1333 dtrace_strncmp(const char *s1, const char *s2, size_t limit)
1334 {
1335         uint8_t c1, c2;
1336         volatile uint16_t *flags;
1337
1338         if (s1 == s2 || limit == 0)
1339                 return (0);
1340
1341         flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1342
1343         do {
1344                 if (s1 == NULL) {
1345                         c1 = '\0';
1346                 } else {
1347                         c1 = dtrace_load8((uintptr_t)s1++);
1348                 }
1349
1350                 if (s2 == NULL) {
1351                         c2 = '\0';
1352                 } else {
1353                         c2 = dtrace_load8((uintptr_t)s2++);
1354                 }
1355
1356                 if (c1 != c2)
1357                         return (c1 - c2);
1358         } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1359
1360         return (0);
1361 }
1362
/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * 'lim' parameter specifies a maximum length to ensure completion: the result
 * is the number of bytes preceding the first NUL, or 'lim' when no NUL is
 * found within the first 'lim' bytes.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
        /*
         * The counter has the same type as the limit; a narrower counter
         * would wrap before ever reaching a limit above its own maximum.
         */
        size_t len;

        for (len = 0; len != lim; len++) {
                if (dtrace_load8((uintptr_t)s++) == '\0')
                        break;
        }

        return (len);
}
1379
1380 /*
1381  * Check if an address falls within a toxic region.
1382  */
1383 static int
1384 dtrace_istoxic(uintptr_t kaddr, size_t size)
1385 {
1386         uintptr_t taddr, tsize;
1387         int i;
1388
1389         for (i = 0; i < dtrace_toxranges; i++) {
1390                 taddr = dtrace_toxrange[i].dtt_base;
1391                 tsize = dtrace_toxrange[i].dtt_limit - taddr;
1392
1393                 if (kaddr - taddr < tsize) {
1394                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1395                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1396                         return (1);
1397                 }
1398
1399                 if (taddr - kaddr < size) {
1400                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1401                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1402                         return (1);
1403                 }
1404         }
1405
1406         return (0);
1407 }
1408
/*
 * Copy 'len' bytes from src to dst using safe memory accesses.  The src is
 * assumed to be unsafe memory specified by the DIF program, and is therefore
 * read through dtrace_load8().  The dst is assumed to be safe memory managed
 * by DTrace and is stored to directly.  As with standard bcopy, overlapping
 * copies are handled properly: the copy runs forwards when dst is at or below
 * src and backwards otherwise.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
        uint8_t *to = dst;
        const uint8_t *from = src;

        if (len == 0)
                return;

        if (to <= from) {
                while (len-- != 0)
                        *to++ = dtrace_load8((uintptr_t)from++);
                return;
        }

        /* dst is above src: copy from the last byte down. */
        to += len;
        from += len;

        while (len-- != 0)
                *--to = dtrace_load8((uintptr_t)--from);
}
1436
/*
 * Copy src to dst using safe memory accesses, stopping after 'len' bytes or
 * after a nul byte has been copied, whichever comes first.  The src is
 * assumed to be unsafe memory specified by the DIF program.  The dst is
 * assumed to be safe memory managed by DTrace and is stored to directly.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
        uint8_t *to = dst;
        const uint8_t *from = src;

        while (len != 0) {
                const uint8_t ch = dtrace_load8((uintptr_t)from++);

                *to++ = ch;
                len--;

                if (ch == '\0')
                        break;
        }
}
1456
1457 /*
1458  * Copy src to dst, deriving the size and type from the specified (BYREF)
1459  * variable type.  The src is assumed to be unsafe memory specified by the DIF
1460  * program.  The dst is assumed to be DTrace variable memory that is of the
1461  * specified type; we assume that we can store to directly.
1462  */
1463 static void
1464 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
1465 {
1466         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1467
1468         if (type->dtdt_kind == DIF_TYPE_STRING) {
1469                 dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
1470         } else {
1471                 dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
1472         }
1473 }
1474
1475 /*
1476  * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
1477  * unsafe memory specified by the DIF program.  The s2 data is assumed to be
1478  * safe memory that we can access directly because it is managed by DTrace.
1479  */
1480 static int
1481 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1482 {
1483         volatile uint16_t *flags;
1484
1485         flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1486
1487         if (s1 == s2)
1488                 return (0);
1489
1490         if (s1 == NULL || s2 == NULL)
1491                 return (1);
1492
1493         if (s1 != s2 && len != 0) {
1494                 const uint8_t *ps1 = s1;
1495                 const uint8_t *ps2 = s2;
1496
1497                 do {
1498                         if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1499                                 return (1);
1500                 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1501         }
1502         return (0);
1503 }
1504
1505 /*
1506  * Zero the specified region using a simple byte-by-byte loop.  Note that this
1507  * is for safe DTrace-managed memory only.
1508  */
1509 static void
1510 dtrace_bzero(void *dst, size_t len)
1511 {
1512         uchar_t *cp;
1513
1514         for (cp = dst; len != 0; len--)
1515                 *cp++ = 0;
1516 }
1517
/*
 * Add two 128-bit values, each stored as two uint64_t words with the low word
 * at index 0, and store the result in 'sum'.  A carry out of the low word is
 * propagated into the high word; a carry out of the high word is dropped.
 * Both words are computed before 'sum' is written, so 'sum' may alias either
 * addend.
 */
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
        const uint64_t lo = addend1[0] + addend2[0];
        /* The low word wrapped iff it is smaller than one of its inputs. */
        const uint64_t carry = (lo < addend1[0] || lo < addend2[0]) ? 1 : 0;
        const uint64_t hi = addend1[1] + addend2[1] + carry;

        sum[0] = lo;
        sum[1] = hi;
}
1530
/*
 * Shift the 128-bit value in a by b bit positions, in place.  The value is
 * stored as two 64-bit words: a[0] is the low word and a[1] the high word.
 * If b is positive, shift left.  If b is negative, shift right (logically,
 * filling with zeroes) by -b.  A shift of 128 or more positions in either
 * direction yields zero.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
        unsigned int n;

        if (b == 0)
                return;

        /*
         * Take the magnitude in unsigned arithmetic so that even the most
         * negative int does not overflow.
         */
        n = (b < 0) ? 0U - (unsigned int)b : (unsigned int)b;

        if (n >= 128) {
                a[0] = 0;
                a[1] = 0;
                return;
        }

        if (b < 0) {
                if (n >= 64) {
                        a[0] = a[1] >> (n - 64);
                        a[1] = 0;
                } else {
                        /*
                         * The low n bits of the high word become the top n
                         * bits of the low word.  Shifting the high word left
                         * by (64 - n) discards every other bit on its own, so
                         * no mask is needed; 1 <= n <= 63 here, which keeps
                         * both shift counts within range.
                         */
                        a[0] = (a[0] >> n) | (a[1] << (64 - n));
                        a[1] >>= n;
                }
        } else {
                if (n >= 64) {
                        a[1] = a[0] << (n - 64);
                        a[0] = 0;
                } else {
                        /*
                         * The top n bits of the low word become the low n
                         * bits of the high word.
                         */
                        a[1] = (a[1] << n) | (a[0] >> (64 - n));
                        a[0] <<= n;
                }
        }
}
1567
1568 /*
1569  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1570  * use native multiplication on those, and then re-combine into the
1571  * resulting 128-bit value.
1572  *
1573  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1574  *     hi1 * hi2 << 64 +
1575  *     hi1 * lo2 << 32 +
1576  *     hi2 * lo1 << 32 +
1577  *     lo1 * lo2
1578  */
1579 static void
1580 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1581 {
1582         uint64_t hi1, hi2, lo1, lo2;
1583         uint64_t tmp[2];
1584
1585         hi1 = factor1 >> 32;
1586         hi2 = factor2 >> 32;
1587
1588         lo1 = factor1 & DT_MASK_LO;
1589         lo2 = factor2 & DT_MASK_LO;
1590
1591         product[0] = lo1 * lo2;
1592         product[1] = hi1 * hi2;
1593
1594         tmp[0] = hi1 * lo2;
1595         tmp[1] = 0;
1596         dtrace_shift_128(tmp, 32);
1597         dtrace_add_128(product, tmp, product);
1598
1599         tmp[0] = hi2 * lo1;
1600         tmp[1] = 0;
1601         dtrace_shift_128(tmp, 32);
1602         dtrace_add_128(product, tmp, product);
1603 }
1604
1605 /*
1606  * This privilege check should be used by actions and subroutines to
1607  * verify that the user credentials of the process that enabled the
1608  * invoking ECB match the target credentials
1609  */
1610 static int
1611 dtrace_priv_proc_common_user(dtrace_state_t *state)
1612 {
1613         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1614
1615         /*
1616          * We should always have a non-NULL state cred here, since if cred
1617          * is null (anonymous tracing), we fast-path bypass this routine.
1618          */
1619         ASSERT(s_cr != NULL);
1620
1621         if ((cr = dtrace_CRED()) != NULL &&
1622             posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
1623             posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
1624             posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
1625             posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
1626             posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
1627             posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
1628                 return (1);
1629
1630         return (0);
1631 }
1632
1633 /*
1634  * This privilege check should be used by actions and subroutines to
1635  * verify that the zone of the process that enabled the invoking ECB
1636  * matches the target credentials
1637  */
1638 static int
1639 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1640 {
1641         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1642 #pragma unused(cr, s_cr, state) /* __APPLE__ */
1643
1644         /*
1645          * We should always have a non-NULL state cred here, since if cred
1646          * is null (anonymous tracing), we fast-path bypass this routine.
1647          */
1648         ASSERT(s_cr != NULL);
1649
1650         return 1; /* APPLE NOTE: Darwin doesn't do zones. */
1651 }
1652
/*
 * This privilege check should be used by actions and subroutines to
 * verify that the process has not setuid or changed credentials.  Darwin
 * has no "No Core Dump" flag to consult, so the check always passes.
 */
static int
dtrace_priv_proc_common_nocd(void)
{
        /* Darwin omits "No Core Dump" flag. */
        return (1);
}
1662
1663 static int
1664 dtrace_priv_proc_destructive(dtrace_state_t *state)
1665 {
1666         int action = state->dts_cred.dcr_action;
1667
1668         if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1669                 goto bad;
1670
1671         if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1672                 goto bad;
1673
1674         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1675             dtrace_priv_proc_common_zone(state) == 0)
1676                 goto bad;
1677
1678         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1679             dtrace_priv_proc_common_user(state) == 0)
1680                 goto bad;
1681
1682         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1683             dtrace_priv_proc_common_nocd() == 0)
1684                 goto bad;
1685
1686         return (1);
1687
1688 bad:
1689         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1690
1691         return (0);
1692 }
1693
1694 static int
1695 dtrace_priv_proc_control(dtrace_state_t *state)
1696 {
1697         if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1698                 goto bad;
1699
1700         if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1701                 goto bad;
1702
1703         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1704                 return (1);
1705
1706         if (dtrace_priv_proc_common_zone(state) &&
1707             dtrace_priv_proc_common_user(state) &&
1708             dtrace_priv_proc_common_nocd())
1709                 return (1);
1710
1711 bad:
1712         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1713
1714         return (0);
1715 }
1716
1717 static int
1718 dtrace_priv_proc(dtrace_state_t *state)
1719 {
1720         if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1721                 goto bad;
1722
1723         if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed() && !dtrace_can_attach_to_proc(current_proc()))
1724                 goto bad;
1725
1726         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1727                 return (1);
1728
1729 bad:
1730         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1731
1732         return (0);
1733 }
1734
1735 /*
1736  * The P_LNOATTACH check is an Apple specific check.
1737  * We need a version of dtrace_priv_proc() that omits
1738  * that check for PID and EXECNAME accesses
1739  */
1740 static int
1741 dtrace_priv_proc_relaxed(dtrace_state_t *state)
1742 {
1743
1744         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1745                 return (1);
1746
1747         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1748
1749         return (0);
1750 }
1751
1752 static int
1753 dtrace_priv_kernel(dtrace_state_t *state)
1754 {
1755         if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed())
1756                 goto bad;
1757
1758         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1759                 return (1);
1760
1761 bad:
1762         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1763
1764         return (0);
1765 }
1766
1767 static int
1768 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1769 {
1770         if (dtrace_is_restricted())
1771                 goto bad;
1772
1773         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1774                 return (1);
1775
1776 bad:
1777         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1778
1779         return (0);
1780 }
1781
1782 /*
1783  * Note:  not called from probe context.  This function is called
1784  * asynchronously (and at a regular interval) from outside of probe context to
1785  * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1786  * cleaning is explained in detail in <sys/dtrace_impl.h>.
1787  */
1788 static void
1789 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1790 {
1791         dtrace_dynvar_t *dirty;
1792         dtrace_dstate_percpu_t *dcpu;
1793         int i, work = 0;
1794
1795         for (i = 0; i < (int)NCPU; i++) {
1796                 dcpu = &dstate->dtds_percpu[i];
1797
1798                 ASSERT(dcpu->dtdsc_rinsing == NULL);
1799
1800                 /*
1801                  * If the dirty list is NULL, there is no dirty work to do.
1802                  */
1803                 if (dcpu->dtdsc_dirty == NULL)
1804                         continue;
1805
1806                 /*
1807                  * If the clean list is non-NULL, then we're not going to do
1808                  * any work for this CPU -- it means that there has not been
1809                  * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1810                  * since the last time we cleaned house.
1811                  */
1812                 if (dcpu->dtdsc_clean != NULL)
1813                         continue;
1814
1815                 work = 1;
1816
1817                 /*
1818                  * Atomically move the dirty list aside.
1819                  */
1820                 do {
1821                         dirty = dcpu->dtdsc_dirty;
1822
1823                         /*
1824                          * Before we zap the dirty list, set the rinsing list.
1825                          * (This allows for a potential assertion in
1826                          * dtrace_dynvar():  if a free dynamic variable appears
1827                          * on a hash chain, either the dirty list or the
1828                          * rinsing list for some CPU must be non-NULL.)
1829                          */
1830                         dcpu->dtdsc_rinsing = dirty;
1831                         dtrace_membar_producer();
1832                 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1833                     dirty, NULL) != dirty);
1834         }
1835
1836         if (!work) {
1837                 /*
1838                  * We have no work to do; we can simply return.
1839                  */
1840                 return;
1841         }
1842
1843         dtrace_sync();
1844
1845         for (i = 0; i < (int)NCPU; i++) {
1846                 dcpu = &dstate->dtds_percpu[i];
1847
1848                 if (dcpu->dtdsc_rinsing == NULL)
1849                         continue;
1850
1851                 /*
1852                  * We are now guaranteed that no hash chain contains a pointer
1853                  * into this dirty list; we can make it clean.
1854                  */
1855                 ASSERT(dcpu->dtdsc_clean == NULL);
1856                 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1857                 dcpu->dtdsc_rinsing = NULL;
1858         }
1859
1860         /*
1861          * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1862          * sure that all CPUs have seen all of the dtdsc_clean pointers.
1863          * This prevents a race whereby a CPU incorrectly decides that
1864          * the state should be something other than DTRACE_DSTATE_CLEAN
1865          * after dtrace_dynvar_clean() has completed.
1866          */
1867         dtrace_sync();
1868
1869         dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1870 }
1871
1872 /*
1873  * Depending on the value of the op parameter, this function looks-up,
1874  * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1875  * allocation is requested, this function will return a pointer to a
1876  * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1877  * variable can be allocated.  If NULL is returned, the appropriate counter
1878  * will be incremented.
1879  */
1880 static dtrace_dynvar_t *
1881 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1882     dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1883     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1884 {
1885         uint64_t hashval = DTRACE_DYNHASH_VALID;
1886         dtrace_dynhash_t *hash = dstate->dtds_hash;
1887         dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1888         processorid_t me = CPU->cpu_id, cpu = me;
1889         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1890         size_t bucket, ksize;
1891         size_t chunksize = dstate->dtds_chunksize;
1892         uintptr_t kdata, lock, nstate;
1893         uint_t i;
1894
1895         ASSERT(nkeys != 0);
1896
1897         /*
1898          * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1899          * algorithm.  For the by-value portions, we perform the algorithm in
1900          * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1901          * bit, and seems to have only a minute effect on distribution.  For
1902          * the by-reference data, we perform "One-at-a-time" iterating (safely)
1903          * over each referenced byte.  It's painful to do this, but it's much
1904          * better than pathological hash distribution.  The efficacy of the
1905          * hashing algorithm (and a comparison with other algorithms) may be
1906          * found by running the ::dtrace_dynstat MDB dcmd.
1907          */
1908         for (i = 0; i < nkeys; i++) {
1909                 if (key[i].dttk_size == 0) {
1910                         uint64_t val = key[i].dttk_value;
1911
1912                         hashval += (val >> 48) & 0xffff;
1913                         hashval += (hashval << 10);
1914                         hashval ^= (hashval >> 6);
1915
1916                         hashval += (val >> 32) & 0xffff;
1917                         hashval += (hashval << 10);
1918                         hashval ^= (hashval >> 6);
1919
1920                         hashval += (val >> 16) & 0xffff;
1921                         hashval += (hashval << 10);
1922                         hashval ^= (hashval >> 6);
1923
1924                         hashval += val & 0xffff;
1925                         hashval += (hashval << 10);
1926                         hashval ^= (hashval >> 6);
1927                 } else {
1928                         /*
1929                          * This is incredibly painful, but it beats the hell
1930                          * out of the alternative.
1931                          */
1932                         uint64_t j, size = key[i].dttk_size;
1933                         uintptr_t base = (uintptr_t)key[i].dttk_value;
1934
1935                         if (!dtrace_canload(base, size, mstate, vstate))
1936                                 break;
1937
1938                         for (j = 0; j < size; j++) {
1939                                 hashval += dtrace_load8(base + j);
1940                                 hashval += (hashval << 10);
1941                                 hashval ^= (hashval >> 6);
1942                         }
1943                 }
1944         }
1945
1946         if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1947                 return (NULL);
1948
1949         hashval += (hashval << 3);
1950         hashval ^= (hashval >> 11);
1951         hashval += (hashval << 15);
1952
1953         /*
1954          * There is a remote chance (ideally, 1 in 2^31) that our hashval
1955          * comes out to be one of our two sentinel hash values.  If this
1956          * actually happens, we set the hashval to be a value known to be a
1957          * non-sentinel value.
1958          */
1959         if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1960                 hashval = DTRACE_DYNHASH_VALID;
1961
1962         /*
1963          * Yes, it's painful to do a divide here.  If the cycle count becomes
1964          * important here, tricks can be pulled to reduce it.  (However, it's
1965          * critical that hash collisions be kept to an absolute minimum;
1966          * they're much more painful than a divide.)  It's better to have a
1967          * solution that generates few collisions and still keeps things
1968          * relatively simple.
1969          */
1970         bucket = hashval % dstate->dtds_hashsize;
1971
1972         if (op == DTRACE_DYNVAR_DEALLOC) {
1973                 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1974
1975                 for (;;) {
1976                         while ((lock = *lockp) & 1)
1977                                 continue;
1978
1979                         if (dtrace_casptr((void *)(uintptr_t)lockp,
1980                             (void *)lock, (void *)(lock + 1)) == (void *)lock)
1981                                 break;
1982                 }
1983
1984                 dtrace_membar_producer();
1985         }
1986
1987 top:
1988         prev = NULL;
1989         lock = hash[bucket].dtdh_lock;
1990
1991         dtrace_membar_consumer();
1992
1993         start = hash[bucket].dtdh_chain;
1994         ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1995             start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1996             op != DTRACE_DYNVAR_DEALLOC));
1997
1998         for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1999                 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
2000                 dtrace_key_t *dkey = &dtuple->dtt_key[0];
2001
2002                 if (dvar->dtdv_hashval != hashval) {
2003                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
2004                                 /*
2005                                  * We've reached the sink, and therefore the
2006                                  * end of the hash chain; we can kick out of
2007                                  * the loop knowing that we have seen a valid
2008                                  * snapshot of state.
2009                                  */
2010                                 ASSERT(dvar->dtdv_next == NULL);
2011                                 ASSERT(dvar == &dtrace_dynhash_sink);
2012                                 break;
2013                         }
2014
2015                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
2016                                 /*
2017                                  * We've gone off the rails:  somewhere along
2018                                  * the line, one of the members of this hash
2019                                  * chain was deleted.  Note that we could also
2020                                  * detect this by simply letting this loop run
2021                                  * to completion, as we would eventually hit
2022                                  * the end of the dirty list.  However, we
2023                                  * want to avoid running the length of the
2024                                  * dirty list unnecessarily (it might be quite
2025                                  * long), so we catch this as early as
2026                                  * possible by detecting the hash marker.  In
2027                                  * this case, we simply set dvar to NULL and
2028                                  * break; the conditional after the loop will
2029                                  * send us back to top.
2030                                  */
2031                                 dvar = NULL;
2032                                 break;
2033                         }
2034
2035                         goto next;
2036                 }
2037
2038                 if (dtuple->dtt_nkeys != nkeys)
2039                         goto next;
2040
2041                 for (i = 0; i < nkeys; i++, dkey++) {
2042                         if (dkey->dttk_size != key[i].dttk_size)
2043                                 goto next; /* size or type mismatch */
2044
2045                         if (dkey->dttk_size != 0) {
2046                                 if (dtrace_bcmp(
2047                                     (void *)(uintptr_t)key[i].dttk_value,
2048                                     (void *)(uintptr_t)dkey->dttk_value,
2049                                     dkey->dttk_size))
2050                                         goto next;
2051                         } else {
2052                                 if (dkey->dttk_value != key[i].dttk_value)
2053                                         goto next;
2054                         }
2055                 }
2056
2057                 if (op != DTRACE_DYNVAR_DEALLOC)
2058                         return (dvar);
2059
2060                 ASSERT(dvar->dtdv_next == NULL ||
2061                     dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
2062
2063                 if (prev != NULL) {
2064                         ASSERT(hash[bucket].dtdh_chain != dvar);
2065                         ASSERT(start != dvar);
2066                         ASSERT(prev->dtdv_next == dvar);
2067                         prev->dtdv_next = dvar->dtdv_next;
2068                 } else {
2069                         if (dtrace_casptr(&hash[bucket].dtdh_chain,
2070                             start, dvar->dtdv_next) != start) {
2071                                 /*
2072                                  * We have failed to atomically swing the
2073                                  * hash table head pointer, presumably because
2074                                  * of a conflicting allocation on another CPU.
2075                                  * We need to reread the hash chain and try
2076                                  * again.
2077                                  */
2078                                 goto top;
2079                         }
2080                 }
2081
2082                 dtrace_membar_producer();
2083
2084                 /*
2085                  * Now set the hash value to indicate that it's free.
2086                  */
2087                 ASSERT(hash[bucket].dtdh_chain != dvar);
2088                 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2089
2090                 dtrace_membar_producer();
2091
2092                 /*
2093                  * Set the next pointer to point at the dirty list, and
2094                  * atomically swing the dirty pointer to the newly freed dvar.
2095                  */
2096                 do {
2097                         next = dcpu->dtdsc_dirty;
2098                         dvar->dtdv_next = next;
2099                 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
2100
2101                 /*
2102                  * Finally, unlock this hash bucket.
2103                  */
2104                 ASSERT(hash[bucket].dtdh_lock == lock);
2105                 ASSERT(lock & 1);
2106                 hash[bucket].dtdh_lock++;
2107
2108                 return (NULL);
2109 next:
2110                 prev = dvar;
2111                 continue;
2112         }
2113
2114         if (dvar == NULL) {
2115                 /*
2116                  * If dvar is NULL, it is because we went off the rails:
2117                  * one of the elements that we traversed in the hash chain
2118                  * was deleted while we were traversing it.  In this case,
2119                  * we assert that we aren't doing a dealloc (deallocs lock
2120                  * the hash bucket to prevent themselves from racing with
2121                  * one another), and retry the hash chain traversal.
2122                  */
2123                 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
2124                 goto top;
2125         }
2126
2127         if (op != DTRACE_DYNVAR_ALLOC) {
2128                 /*
2129                  * If we are not to allocate a new variable, we want to
2130                  * return NULL now.  Before we return, check that the value
2131                  * of the lock word hasn't changed.  If it has, we may have
2132                  * seen an inconsistent snapshot.
2133                  */
2134                 if (op == DTRACE_DYNVAR_NOALLOC) {
2135                         if (hash[bucket].dtdh_lock != lock)
2136                                 goto top;
2137                 } else {
2138                         ASSERT(op == DTRACE_DYNVAR_DEALLOC);
2139                         ASSERT(hash[bucket].dtdh_lock == lock);
2140                         ASSERT(lock & 1);
2141                         hash[bucket].dtdh_lock++;
2142                 }
2143
2144                 return (NULL);
2145         }
2146
2147         /*
2148          * We need to allocate a new dynamic variable.  The size we need is the
2149          * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
2150          * size of any auxiliary key data (rounded up to 8-byte alignment) plus
2151          * the size of any referred-to data (dsize).  We then round the final
2152          * size up to the chunksize for allocation.
2153          */
2154         for (ksize = 0, i = 0; i < nkeys; i++)
2155                 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
2156
2157         /*
2158          * This should be pretty much impossible, but could happen if, say,
2159          * strange DIF specified the tuple.  Ideally, this should be an
2160          * assertion and not an error condition -- but that requires that the
2161          * chunksize calculation in dtrace_difo_chunksize() be absolutely
2162          * bullet-proof.  (That is, it must not be able to be fooled by
2163          * malicious DIF.)  Given the lack of backwards branches in DIF,
2164          * solving this would presumably not amount to solving the Halting
2165          * Problem -- but it still seems awfully hard.
2166          */
2167         if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
2168             ksize + dsize > chunksize) {
2169                 dcpu->dtdsc_drops++;
2170                 return (NULL);
2171         }
2172
2173         nstate = DTRACE_DSTATE_EMPTY;
2174
2175         do {
2176 retry:
2177                 free = dcpu->dtdsc_free;
2178
2179                 if (free == NULL) {
2180                         dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2181                         void *rval;
2182
2183                         if (clean == NULL) {
2184                                 /*
2185                                  * We're out of dynamic variable space on
2186                                  * this CPU.  Unless we have tried all CPUs,
2187                                  * we'll try to allocate from a different
2188                                  * CPU.
2189                                  */
2190                                 switch (dstate->dtds_state) {
2191                                 case DTRACE_DSTATE_CLEAN: {
2192                                         void *sp = &dstate->dtds_state;
2193
2194                                         if (++cpu >= (int)NCPU)
2195                                                 cpu = 0;
2196
2197                                         if (dcpu->dtdsc_dirty != NULL &&
2198                                             nstate == DTRACE_DSTATE_EMPTY)
2199                                                 nstate = DTRACE_DSTATE_DIRTY;
2200
2201                                         if (dcpu->dtdsc_rinsing != NULL)
2202                                                 nstate = DTRACE_DSTATE_RINSING;
2203
2204                                         dcpu = &dstate->dtds_percpu[cpu];
2205
2206                                         if (cpu != me)
2207                                                 goto retry;
2208
2209                                         (void) dtrace_cas32(sp,
2210                                             DTRACE_DSTATE_CLEAN, nstate);
2211
2212                                         /*
2213                                          * To increment the correct bean
2214                                          * counter, take another lap.
2215                                          */
2216                                         goto retry;
2217                                 }
2218
2219                                 case DTRACE_DSTATE_DIRTY:
2220                                         dcpu->dtdsc_dirty_drops++;
2221                                         break;
2222
2223                                 case DTRACE_DSTATE_RINSING:
2224                                         dcpu->dtdsc_rinsing_drops++;
2225                                         break;
2226
2227                                 case DTRACE_DSTATE_EMPTY:
2228                                         dcpu->dtdsc_drops++;
2229                                         break;
2230                                 }
2231
2232                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2233                                 return (NULL);
2234                         }
2235
2236                         /*
2237                          * The clean list appears to be non-empty.  We want to
2238                          * move the clean list to the free list; we start by
2239                          * moving the clean pointer aside.
2240                          */
2241                         if (dtrace_casptr(&dcpu->dtdsc_clean,
2242                             clean, NULL) != clean) {
2243                                 /*
2244                                  * We are in one of two situations:
2245                                  *
2246                                  *  (a) The clean list was switched to the
2247                                  *      free list by another CPU.
2248                                  *
2249                                  *  (b) The clean list was added to by the
2250                                  *      cleansing cyclic.
2251                                  *
2252                                  * In either of these situations, we can
2253                                  * just reattempt the free list allocation.
2254                                  */
2255                                 goto retry;
2256                         }
2257
2258                         ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2259
2260                         /*
2261                          * Now we'll move the clean list to the free list.
2262                          * It's impossible for this to fail:  the only way
2263                          * the free list can be updated is through this
2264                          * code path, and only one CPU can own the clean list.
2265                          * Thus, it would only be possible for this to fail if
2266                          * this code were racing with dtrace_dynvar_clean().
2267                          * (That is, if dtrace_dynvar_clean() updated the clean
2268                          * list, and we ended up racing to update the free
2269                          * list.)  This race is prevented by the dtrace_sync()
2270                          * in dtrace_dynvar_clean() -- which flushes the
2271                          * owners of the clean lists out before resetting
2272                          * the clean lists.
2273                          */
2274                         rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2275                         ASSERT(rval == NULL);
2276                         goto retry;
2277                 }
2278
2279                 dvar = free;
2280                 new_free = dvar->dtdv_next;
2281         } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2282
2283         /*
2284          * We have now allocated a new chunk.  We copy the tuple keys into the
2285          * tuple array and copy any referenced key data into the data space
2286          * following the tuple array.  As we do this, we relocate dttk_value
2287          * in the final tuple to point to the key data address in the chunk.
2288          */
2289         kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2290         dvar->dtdv_data = (void *)(kdata + ksize);
2291         dvar->dtdv_tuple.dtt_nkeys = nkeys;
2292
2293         for (i = 0; i < nkeys; i++) {
2294                 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2295                 size_t kesize = key[i].dttk_size;
2296
2297                 if (kesize != 0) {
2298                         dtrace_bcopy(
2299                             (const void *)(uintptr_t)key[i].dttk_value,
2300                             (void *)kdata, kesize);
2301                         dkey->dttk_value = kdata;
2302                         kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2303                 } else {
2304                         dkey->dttk_value = key[i].dttk_value;
2305                 }
2306
2307                 dkey->dttk_size = kesize;
2308         }
2309
2310         ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2311         dvar->dtdv_hashval = hashval;
2312         dvar->dtdv_next = start;
2313
2314         if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2315                 return (dvar);
2316
2317         /*
2318          * The cas has failed.  Either another CPU is adding an element to
2319          * this hash chain, or another CPU is deleting an element from this
2320          * hash chain.  The simplest way to deal with both of these cases
2321          * (though not necessarily the most efficient) is to free our
2322          * allocated block and tail-call ourselves.  Note that the free is
2323          * to the dirty list and _not_ to the free list.  This is to prevent
2324          * races with allocators, above.
2325          */
2326         dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2327
2328         dtrace_membar_producer();
2329
2330         do {
2331                 free = dcpu->dtdsc_dirty;
2332                 dvar->dtdv_next = free;
2333         } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2334
2335         return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2336 }
2337
/*
 * min() aggregator:  replace the stored value with nval when nval is the
 * smaller of the two.  Both quantities are compared as signed 64-bit
 * integers.  The third argument is ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        const int64_t candidate = (int64_t)nval;
        const int64_t current = (int64_t)*oval;

        if (current > candidate)
                *oval = nval;
}
2346
/*
 * max() aggregator:  replace the stored value with nval when nval is the
 * larger of the two.  Both quantities are compared as signed 64-bit
 * integers.  The third argument is ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        const int64_t candidate = (int64_t)nval;
        const int64_t current = (int64_t)*oval;

        if (current < candidate)
                *oval = nval;
}
2355
2356 static void
2357 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2358 {
2359         int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2360         int64_t val = (int64_t)nval;
2361
2362         if (val < 0) {
2363                 for (i = 0; i < zero; i++) {
2364                         if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2365                                 quanta[i] += incr;
2366                                 return;
2367                         }
2368                 }
2369         } else {
2370                 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2371                         if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2372                                 quanta[i - 1] += incr;
2373                                 return;
2374                         }
2375                 }
2376
2377                 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2378                 return;
2379         }
2380
2381         ASSERT(0);
2382 }
2383
/*
 * lquantize() aggregator.  The first 64-bit word of lquanta[] holds the
 * encoded base, step and number of levels; the counters follow it:
 *
 *   counter 0             underflow (value below base)
 *   counters 1..levels    one per linear step
 *   counter levels + 1    overflow
 *
 * nval is truncated to a signed 32-bit value before being bucketed, and
 * incr is added to the selected counter.
 */
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
        uint64_t arg = *lquanta++;
        int32_t base = DTRACE_LQUANTIZE_BASE(arg);
        uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
        uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
        int32_t val = (int32_t)nval;
        int64_t level;

        ASSERT(step != 0);
        ASSERT(levels != 0);

        if (val < base) {
                /*
                 * This is an underflow.
                 */
                lquanta[0] += incr;
                return;
        }

        /*
         * The distance from base to val can be as large as 2^32 - 1 (e.g.
         * a negative base and a value near INT32_MAX), which does not fit
         * in an int32_t.  Computing it in 32 bits is signed overflow; in
         * practice the result wraps negative, slips past the range check
         * below, and indexes in front of the counter array.  Do the
         * subtraction in 64 bits so that such values correctly land in the
         * overflow counter.
         */
        level = ((int64_t)val - (int64_t)base) / step;

        if (level < levels) {
                lquanta[level + 1] += incr;
                return;
        }

        /*
         * This is an overflow.
         */
        lquanta[levels + 1] += incr;
}
2416
/*
 * Compute the log/linear bucket index for value.  Each order of magnitude
 * (a power of factor) between low and high is divided into linear steps,
 * at most nsteps of them.  Bucket 0 takes every value below factor^low,
 * and the index returned after the final order of magnitude is the top
 * bucket, which takes every value at or above factor^(high + 1).
 */
static int
dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
                                   int16_t nsteps, int64_t value)
{
        int64_t lower, upper = 1, following;
        int bucket = 1, order = 0;

        /*
         * Raise factor to the low order of magnitude.
         */
        while (order < low) {
                upper *= factor;
                order++;
        }

        /*
         * Anything below that power belongs in the zeroth bucket.
         */
        if (value < upper)
                return 0;

        lower = upper;
        upper *= factor;

        while (order <= high) {
                int nbuckets = (upper > nsteps) ? nsteps : upper;

                /*
                 * Log/linear quantizations should not normally have a high
                 * magnitude that lets 64 bits overflow, but guard against
                 * it anyway:  if the next power wraps, clamp the value so
                 * that it is placed within the current order of magnitude.
                 */
                following = upper * factor;
                if (following < upper)
                        value = upper - 1;

                if (value < upper) {
                        /*
                         * The value lies within this order of magnitude.
                         * Its position is the offset from the bottom of
                         * the order, divided by the bucket width, added to
                         * the buckets accumulated so far.
                         */
                        int64_t width = upper / nbuckets;

                        return (bucket + (value - lower) / width);
                }

                bucket += nbuckets - (nbuckets / factor);
                lower = upper;
                upper = following;
                order++;
        }

        /*
         * The value is at least factor^(high + 1):  top bucket.
         */
        return bucket;
}
2472
/*
 * llquantize() aggregator.  The first 64-bit word of llquanta[] holds the
 * encoded factor, low and high magnitudes and step count; the counters
 * follow it.  incr is added to the counter selected by
 * dtrace_aggregate_llquantize_bucket() for nval.
 */
static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
        const uint64_t encoded = llquanta[0];
        uint64_t *counters = llquanta + 1;
        uint16_t factor, low, high, nsteps;
        int slot;

        factor = DTRACE_LLQUANTIZE_FACTOR(encoded);
        low = DTRACE_LLQUANTIZE_LOW(encoded);
        high = DTRACE_LLQUANTIZE_HIGH(encoded);
        nsteps = DTRACE_LLQUANTIZE_NSTEP(encoded);

        slot = dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps,
            nval);
        counters[slot] += incr;
}
2484
/*
 * avg() aggregator:  data[0] counts the samples and data[1] accumulates
 * their sum.  The third argument is ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        uint64_t *nsamples = &data[0];
        uint64_t *total = &data[1];

        *nsamples += 1;
        *total = *total + nval;
}
2493
/*
 * stddev() aggregator.  data[0] counts the samples, data[1] accumulates
 * their sum, and data[2..3] accumulate the sum of their squares as a
 * 128-bit quantity.  The third argument is ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        uint64_t magnitude = nval;
        uint64_t tmp[2];

        data[0]++;
        data[1] += nval;

        /*
         * What we want to say here is:
         *
         * data[2] += nval * nval;
         *
         * But given that nval is 64-bit, we could easily overflow, so
         * we do this as 128-bit arithmetic.
         *
         * The square only depends on the magnitude of the (signed) value.
         * Take that magnitude in unsigned arithmetic:  negating INT64_MIN
         * as a signed quantity is undefined, whereas the unsigned negation
         * is well defined and yields the correct magnitude (2^63).
         */
        if ((int64_t)nval < 0)
                magnitude = (uint64_t)0 - nval;

        dtrace_multiply_128(magnitude, magnitude, tmp);
        dtrace_add_128(data + 2, tmp, data + 2);
}
2519
/*
 * count() aggregator:  bump the stored counter by one.  Neither the value
 * nor the extra argument is consulted.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(nval, arg) /* __APPLE__ */
        (*oval)++;
}
2527
/*
 * sum() aggregator:  add nval to the stored total (unsigned 64-bit
 * arithmetic).  The third argument is ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        const uint64_t total = *oval + nval;

        *oval = total;
}
2535
/*
 * Aggregate given the tuple in the principal data buffer, and the aggregating
 * action denoted by the specified dtrace_aggregation_t.  The aggregation
 * buffer is specified as the buf parameter.  This routine does not return
 * failure; if there is no space in the aggregation buffer, the data will be
 * dropped, and a corresponding counter incremented.
 *
 * Arguments:
 *
 *   agg     The aggregation; supplies the record layout, the tuple actions
 *           (dtag_first), the aggregation id, the initial value and the
 *           aggregating function (dtag_aggregate).
 *   dbuf    The principal buffer; the tuple is read from
 *           dbuf->dtb_tomax + offset + agg->dtag_base.
 *   offset  Offset into dbuf, see above.
 *   buf     The aggregation buffer.  If NULL, nothing is done.
 *   expr    The value handed to the aggregating function.
 *   arg     The aggregating function's extra argument; forced to 1 when
 *           agg->dtag_hasarg is not set.
 *
 * As used below, the aggregation buffer is laid out as follows:  key data
 * (tuple followed by the aggregated value) grows upwards from dtb_tomax,
 * tracked by dtb_offset; a dtrace_aggbuffer_t sits at the very end of the
 * buffer; the hash table sits immediately below it; and dtrace_aggkey_t
 * structures are carved downwards from dtagb_free, starting at the hash
 * table.
 */
static void
dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
{
#pragma unused(arg)
        /*
         * NOTE(review): arg is both assigned and read below, so the pragma
         * above looks vestigial.
         */
        dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
        uint32_t i, ndx, size, fsize;
        uint32_t align = sizeof (uint64_t) - 1;
        dtrace_aggbuffer_t *agb;
        dtrace_aggkey_t *key;
        uint32_t hashval = 0, limit, isstr;
        caddr_t tomax, data, kdata;
        dtrace_actkind_t action;
        dtrace_action_t *act;
        uintptr_t offs;

        if (buf == NULL)
                return;

        if (!agg->dtag_hasarg) {
                /*
                 * The aggregating actions in this file that use their
                 * additional argument -- quantize(), lquantize() and
                 * llquantize() -- all give it the same semantics:  an
                 * increment value that defaults to 1 when not present.  If
                 * additional aggregating actions take arguments, the setting
                 * of the default argument value will presumably have to
                 * become more sophisticated...
                 */
                arg = 1;
        }

        /*
         * size is the length of the tuple (the key); fsize additionally
         * covers the aggregated value that follows it.
         */
        action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
        size = rec->dtrd_offset - agg->dtag_base;
        fsize = size + rec->dtrd_size;

        ASSERT(dbuf->dtb_tomax != NULL);
        data = dbuf->dtb_tomax + offset + agg->dtag_base;

        if ((tomax = buf->dtb_tomax) == NULL) {
                dtrace_buffer_drop(buf);
                return;
        }

        /*
         * The metastructure is always at the bottom of the buffer.
         */
        agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
            sizeof (dtrace_aggbuffer_t));

        if (buf->dtb_offset == 0) {
                /*
                 * Nothing has been stored in this buffer yet, so the
                 * metastructure and hash table are set up here.
                 *
                 * We just kludge up approximately 1/8th of the size to be
                 * buckets.  If this guess ends up being routinely
                 * off-the-mark, we may need to dynamically readjust this
                 * based on past performance.
                 */
                uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);

                if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
                    (uintptr_t)tomax || hashsize == 0) {
                        /*
                         * We've been given a ludicrously small buffer;
                         * increment our drop count and leave.
                         */
                        dtrace_buffer_drop(buf);
                        return;
                }

                /*
                 * And now, a pathetic attempt to try to get an odd (or
                 * perchance, a prime) hash size for better hash distribution.
                 */
                if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
                        hashsize -= DTRACE_AGGHASHSIZE_SLEW;

                agb->dtagb_hashsize = hashsize;
                agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
                    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
                agb->dtagb_free = (uintptr_t)agb->dtagb_hash;

                for (i = 0; i < agb->dtagb_hashsize; i++)
                        agb->dtagb_hash[i] = NULL;
        }

        ASSERT(agg->dtag_first != NULL);
        ASSERT(agg->dtag_first->dta_intuple);

        /*
         * Calculate the hash value based on the key.  Note that we _don't_
         * include the aggid in the hashing (but we will store it as part of
         * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
         * algorithm: a simple, quick algorithm that has no known funnels, and
         * gets good distribution in practice.  The efficacy of the hashing
         * algorithm (and a comparison with other algorithms) may be found by
         * running the ::dtrace_aggstat MDB dcmd.
         *
         * For string members of the tuple, hashing stops after the first
         * NUL byte; whatever follows it in the record is not hashed.
         */
        for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
                i = act->dta_rec.dtrd_offset - agg->dtag_base;
                limit = i + act->dta_rec.dtrd_size;
                ASSERT(limit <= size);
                isstr = DTRACEACT_ISSTRING(act);

                for (; i < limit; i++) {
                        hashval += data[i];
                        hashval += (hashval << 10);
                        hashval ^= (hashval >> 6);

                        if (isstr && data[i] == '\0')
                                break;
                }
        }

        hashval += (hashval << 3);
        hashval ^= (hashval >> 11);
        hashval += (hashval << 15);

        /*
         * Yes, the divide here is expensive -- but it's generally the least
         * of the performance issues given the amount of data that we iterate
         * over to compute hash values, compare data, etc.
         */
        ndx = hashval % agb->dtagb_hashsize;

        /*
         * Walk the hash chain looking for an existing key that matches the
         * tuple byte-for-byte (strings are compared only up to and including
         * their first NUL).
         */
        for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
                ASSERT((caddr_t)key >= tomax);
                ASSERT((caddr_t)key < tomax + buf->dtb_size);

                if (hashval != key->dtak_hashval || key->dtak_size != size)
                        continue;

                kdata = key->dtak_data;
                ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);

                for (act = agg->dtag_first; act->dta_intuple;
                    act = act->dta_next) {
                        i = act->dta_rec.dtrd_offset - agg->dtag_base;
                        limit = i + act->dta_rec.dtrd_size;
                        ASSERT(limit <= size);
                        isstr = DTRACEACT_ISSTRING(act);

                        for (; i < limit; i++) {
                                if (kdata[i] != data[i])
                                        goto next;

                                if (isstr && data[i] == '\0')
                                        break;
                        }
                }

                if (action != key->dtak_action) {
                        /*
                         * We are aggregating on the same value in the same
                         * aggregation with two different aggregating actions.
                         * (This should have been picked up in the compiler,
                         * so we may be dealing with errant or devious DIF.)
                         * This is an error condition; we indicate as much,
                         * and return.
                         */
                        DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
                        return;
                }

                /*
                 * This is a hit:  we need to apply the aggregator to
                 * the value at this key.
                 */
                agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
                return;
next:
                /*
                 * Mismatch in the tuple data; try the next key on the chain.
                 */
                continue;
        }

        /*
         * We didn't find it.  We need to allocate some zero-filled space,
         * link it into the hash table appropriately, and apply the aggregator
         * to the (zero-filled) value.
         *
         * NOTE(review): align is sizeof (uint64_t) - 1, so the mask tested
         * below is (align - 1), i.e. 6 rather than 7.  Stepping by
         * sizeof (uint32_t) reaches an 8-byte boundary only if offs is
         * already a multiple of 4; if bit 1 of offs were ever set, this loop
         * would not terminate.  Confirm that dtb_offset preserves that
         * invariant.
         */
        offs = buf->dtb_offset;
        while (offs & (align - 1))
                offs += sizeof (uint32_t);

        /*
         * If we don't have enough room to both allocate a new key _and_
         * its associated data, increment the drop count and return.
         */
        if ((uintptr_t)tomax + offs + fsize >
            agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
                dtrace_buffer_drop(buf);
                return;
        }

        /*
         * Carve the key structure off the top of the free region (which
         * grows downwards), and place the key's data at the aligned offset
         * at the bottom of the buffer (which grows upwards).
         */
        /*CONSTCOND*/
        ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
        key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
        agb->dtagb_free -= sizeof (dtrace_aggkey_t);

        key->dtak_data = kdata = tomax + offs;
        buf->dtb_offset = offs + fsize;

        /*
         * Now copy the data across.  The leading dtrace_aggid_t of the key
         * is set to this aggregation's id rather than copied from the
         * principal buffer.
         */
        *((dtrace_aggid_t *)kdata) = agg->dtag_id;

        for (i = sizeof (dtrace_aggid_t); i < size; i++)
                kdata[i] = data[i];

        /*
         * Because strings are not zeroed out by default, we need to iterate
         * looking for actions that store strings, and we need to explicitly
         * pad these strings out with zeroes.
         */
        for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
                int nul;

                if (!DTRACEACT_ISSTRING(act))
                        continue;

                i = act->dta_rec.dtrd_offset - agg->dtag_base;
                limit = i + act->dta_rec.dtrd_size;
                ASSERT(limit <= size);

                /*
                 * nul is set once the first NUL byte has been seen in the
                 * source string; from then on, every remaining byte of the
                 * copy is overwritten with zero.
                 */
                for (nul = 0; i < limit; i++) {
                        if (nul) {
                                kdata[i] = '\0';
                                continue;
                        }

                        if (data[i] != '\0')
                                continue;

                        nul = 1;
                }
        }

        /*
         * Zero-fill the space reserved for the aggregated value.
         */
        for (i = size; i < fsize; i++)
                kdata[i] = 0;

        /*
         * Fill in the key and push it onto the front of its hash chain.
         */
        key->dtak_hashval = hashval;
        key->dtak_size = size;
        key->dtak_action = action;
        key->dtak_next = agb->dtagb_hash[ndx];
        agb->dtagb_hash[ndx] = key;

        /*
         * Finally, apply the aggregator.  The first 64-bit word of the value
         * is seeded with the aggregation's initial value beforehand.
         */
        *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
        agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
}
2793
2794 /*
2795  * Given consumer state, this routine finds a speculation in the INACTIVE
2796  * state and transitions it into the ACTIVE state.  If there is no speculation
2797  * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2798  * incremented -- it is up to the caller to take appropriate action.
2799  */
2800 static int
2801 dtrace_speculation(dtrace_state_t *state)
2802 {
2803         int i = 0;
2804         dtrace_speculation_state_t current;
2805         uint32_t *stat = &state->dts_speculations_unavail, count;
2806
2807         while (i < state->dts_nspeculations) {
2808                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2809
2810                 current = spec->dtsp_state;
2811
2812                 if (current != DTRACESPEC_INACTIVE) {
2813                         if (current == DTRACESPEC_COMMITTINGMANY ||
2814                             current == DTRACESPEC_COMMITTING ||
2815                             current == DTRACESPEC_DISCARDING)
2816                                 stat = &state->dts_speculations_busy;
2817                         i++;
2818                         continue;
2819                 }
2820
2821                 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2822                     current, DTRACESPEC_ACTIVE) == current)
2823                         return (i + 1);
2824         }
2825
2826         /*
2827          * We couldn't find a speculation.  If we found as much as a single
2828          * busy speculation buffer, we'll attribute this failure as "busy"
2829          * instead of "unavail".
2830          */
2831         do {
2832                 count = *stat;
2833         } while (dtrace_cas32(stat, count, count + 1) != count);
2834
2835         return (0);
2836 }
2837
/*
 * This routine commits an active speculation.  If the specified speculation
 * is not in a valid state to perform a commit(), this routine will silently do
 * nothing.  The state of the specified speculation is transitioned according
 * to the state transition diagram outlined in <sys/dtrace_impl.h>
 *
 * Arguments:
 *
 *   state   The consumer state owning the speculations and the principal
 *           buffers.
 *   cpu     Selects both the speculation's per-CPU source buffer and the
 *           per-CPU principal (destination) buffer.
 *   which   The speculation ID (index plus one).  An ID of 0 is ignored; an
 *           ID beyond dts_nspeculations sets CPU_DTRACE_ILLOP for this CPU.
 */
static void
dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
    dtrace_specid_t which)
{
        dtrace_speculation_t *spec;
        dtrace_buffer_t *src, *dest;
        uintptr_t daddr, saddr, dlimit, slimit;
        dtrace_speculation_state_t current,  new = DTRACESPEC_INACTIVE;
        intptr_t offs;
        uint64_t timestamp;

        if (which == 0)
                return;

        if (which > (dtrace_specid_t)state->dts_nspeculations) {
                cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
                return;
        }

        spec = &state->dts_speculations[which - 1];
        src = &spec->dtsp_buffer[cpu];
        dest = &state->dts_buffer[cpu];

        /*
         * Pick the state to move to based on the current state, and retry
         * until the compare-and-swap installs it without interference.
         */
        do {
                current = spec->dtsp_state;

                /*
                 * Already COMMITTINGMANY:  no state change is needed, so we
                 * skip the compare-and-swap and go straight to copying this
                 * CPU's buffer.
                 */
                if (current == DTRACESPEC_COMMITTINGMANY)
                        break;

                switch (current) {
                case DTRACESPEC_INACTIVE:
                case DTRACESPEC_DISCARDING:
                        return;

                case DTRACESPEC_COMMITTING:
                        /*
                         * This is only possible if we are (a) commit()'ing
                         * without having done a prior speculate() on this CPU
                         * and (b) racing with another commit() on a different
                         * CPU.  There's nothing to do -- we just assert that
                         * our offset is 0.
                         */
                        ASSERT(src->dtb_offset == 0);
                        return;

                case DTRACESPEC_ACTIVE:
                        new = DTRACESPEC_COMMITTING;
                        break;

                case DTRACESPEC_ACTIVEONE:
                        /*
                         * This speculation is active on one CPU.  If our
                         * buffer offset is non-zero, we know that the one CPU
                         * must be us.  Otherwise, we are committing on a
                         * different CPU from the speculate(), and we must
                         * rely on being asynchronously cleaned.
                         */
                        if (src->dtb_offset != 0) {
                                new = DTRACESPEC_COMMITTING;
                                break;
                        }
                        /*FALLTHROUGH*/

                case DTRACESPEC_ACTIVEMANY:
                        new = DTRACESPEC_COMMITTINGMANY;
                        break;

                default:
                        ASSERT(0);
                }
        } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
            current, new) != current);

        /*
         * We have set the state to indicate that we are committing this
         * speculation.  Now reserve the necessary space in the destination
         * buffer.  If the reservation fails, a drop is recorded against the
         * destination and the speculative data is thrown away at "out".
         */
        if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
            sizeof (uint64_t), state, NULL)) < 0) {
                dtrace_buffer_drop(dest);
                goto out;
        }

        /*
         * We have sufficient space to copy the speculative buffer into the
         * primary buffer.  First, modify the speculative buffer, filling
         * in the timestamp of all entries with the current time.  The data
         * must have the commit() time rather than the time it was traced,
         * so that all entries in the primary buffer are in timestamp order.
         */
        timestamp = dtrace_gethrtime();
        saddr = (uintptr_t)src->dtb_tomax;
        slimit = saddr + src->dtb_offset;
        while (saddr < slimit) {
                size_t size;
                dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;

                /*
                 * An EPID of DTRACE_EPIDNONE is not a record; step over it
                 * one dtrace_epid_t at a time.
                 */
                if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
                        saddr += sizeof (dtrace_epid_t);
                        continue;
                }

                /*
                 * The record's length comes from the ECB that its EPID
                 * names.
                 */
                ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
                size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;

                ASSERT(saddr + size <= slimit);
                ASSERT(size >= sizeof(dtrace_rechdr_t));
                ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);

                DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);

                saddr += size;
        }

        /*
         * Copy the buffer across.  (Note that this is a
         * highly suboptimal bcopy(); in the unlikely event that this becomes
         * a serious performance issue, a high-performance DTrace-specific
         * bcopy() should obviously be invented.)
         */
        daddr = (uintptr_t)dest->dtb_tomax + offs;
        dlimit = daddr + src->dtb_offset;
        saddr = (uintptr_t)src->dtb_tomax;

        /*
         * First, the aligned portion.
         */
        while (dlimit - daddr >= sizeof (uint64_t)) {
                *((uint64_t *)daddr) = *((uint64_t *)saddr);

                daddr += sizeof (uint64_t);
                saddr += sizeof (uint64_t);
        }

        /*
         * Now any left-over bit...
         */
        while (dlimit - daddr)
                *((uint8_t *)daddr++) = *((uint8_t *)saddr++);

        /*
         * Finally, commit the reserved space in the destination buffer.
         */
        dest->dtb_offset = offs + src->dtb_offset;

out:
        /*
         * If we're lucky enough to be the only active CPU on this speculation
         * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
         */
        if (current == DTRACESPEC_ACTIVE ||
            (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
                uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
                    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
#pragma unused(rval) /* __APPLE__ */

                ASSERT(rval == DTRACESPEC_COMMITTING);
        }

        /*
         * Reset this CPU's speculative buffer, folding its drop count into
         * dtb_xamot_drops.
         */
        src->dtb_offset = 0;
        src->dtb_xamot_drops += src->dtb_drops;
        src->dtb_drops = 0;
}
3008
3009 /*
3010  * This routine discards an active speculation.  If the specified speculation
3011  * is not in a valid state to perform a discard(), this routine will silently
3012  * do nothing.  The state of the specified speculation is transitioned
3013  * according to the state transition diagram outlined in <sys/dtrace_impl.h>
3014  */
3015 static void
3016 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
3017     dtrace_specid_t which)
3018 {
3019         dtrace_speculation_t *spec;
3020         dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3021         dtrace_buffer_t *buf;
3022
3023         if (which == 0)
3024                 return;
3025
3026         if (which > (dtrace_specid_t)state->dts_nspeculations) {
3027                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3028                 return;
3029         }
3030
3031         spec = &state->dts_speculations[which - 1];
3032         buf = &spec->dtsp_buffer[cpu];
3033
3034         do {
3035                 current = spec->dtsp_state;
3036
3037                 switch (current) {
3038                 case DTRACESPEC_INACTIVE:
3039                 case DTRACESPEC_COMMITTINGMANY:
3040                 case DTRACESPEC_COMMITTING:
3041                 case DTRACESPEC_DISCARDING:
3042                         return;
3043
3044                 case DTRACESPEC_ACTIVE:
3045                 case DTRACESPEC_ACTIVEMANY:
3046                         new = DTRACESPEC_DISCARDING;
3047                         break;
3048
3049                 case DTRACESPEC_ACTIVEONE:
3050                         if (buf->dtb_offset != 0) {
3051                                 new = DTRACESPEC_INACTIVE;
3052                         } else {
3053                                 new = DTRACESPEC_DISCARDING;
3054                         }
3055                         break;
3056
3057                 default:
3058                         ASSERT(0);
3059                 }
3060         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3061             current, new) != current);
3062
3063         buf->dtb_offset = 0;
3064         buf->dtb_drops = 0;
3065 }
3066
3067 /*
3068  * Note:  not called from probe context.  This function is called
3069  * asynchronously from cross call context to clean any speculations that are
3070  * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
3071  * transitioned back to the INACTIVE state until all CPUs have cleaned the
3072  * speculation.
3073  */
3074 static void
3075 dtrace_speculation_clean_here(dtrace_state_t *state)
3076 {
3077         dtrace_icookie_t cookie;
3078         processorid_t cpu = CPU->cpu_id;
3079         dtrace_buffer_t *dest = &state->dts_buffer[cpu];
3080         dtrace_specid_t i;
3081
3082         cookie = dtrace_interrupt_disable();
3083
3084         if (dest->dtb_tomax == NULL) {
3085                 dtrace_interrupt_enable(cookie);
3086                 return;
3087         }
3088
3089         for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3090                 dtrace_speculation_t *spec = &state->dts_speculations[i];
3091                 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
3092
3093                 if (src->dtb_tomax == NULL)
3094                         continue;
3095
3096                 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
3097                         src->dtb_offset = 0;
3098                         continue;
3099                 }
3100
3101                 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3102                         continue;
3103
3104                 if (src->dtb_offset == 0)
3105                         continue;
3106
3107                 dtrace_speculation_commit(state, cpu, i + 1);
3108         }
3109
3110         dtrace_interrupt_enable(cookie);
3111 }
3112
3113 /*
3114  * Note:  not called from probe context.  This function is called
3115  * asynchronously (and at a regular interval) to clean any speculations that
3116  * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
3117  * is work to be done, it cross calls all CPUs to perform that work;
3118  * COMMITMANY and DISCARDING speculations may not be transitioned back to the
3119  * INACTIVE state until they have been cleaned by all CPUs.
3120  */
3121 static void
3122 dtrace_speculation_clean(dtrace_state_t *state)
3123 {
3124         int work = 0;
3125         uint32_t rv;
3126         dtrace_specid_t i;
3127
3128         for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3129                 dtrace_speculation_t *spec = &state->dts_speculations[i];
3130
3131                 ASSERT(!spec->dtsp_cleaning);
3132
3133                 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
3134                     spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3135                         continue;
3136
3137                 work++;
3138                 spec->dtsp_cleaning = 1;
3139         }
3140
3141         if (!work)
3142                 return;
3143
3144         dtrace_xcall(DTRACE_CPUALL,
3145             (dtrace_xcall_t)dtrace_speculation_clean_here, state);
3146
3147         /*
3148          * We now know that all CPUs have committed or discarded their
3149          * speculation buffers, as appropriate.  We can now set the state
3150          * to inactive.
3151          */
3152         for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3153                 dtrace_speculation_t *spec = &state->dts_speculations[i];
3154                 dtrace_speculation_state_t current, new;
3155
3156                 if (!spec->dtsp_cleaning)
3157                         continue;
3158
3159                 current = spec->dtsp_state;
3160                 ASSERT(current == DTRACESPEC_DISCARDING ||
3161                     current == DTRACESPEC_COMMITTINGMANY);
3162
3163                 new = DTRACESPEC_INACTIVE;
3164
3165                 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
3166                 ASSERT(rv == current);
3167                 spec->dtsp_cleaning = 0;
3168         }
3169 }
3170
3171 /*
3172  * Called as part of a speculate() to get the speculative buffer associated
3173  * with a given speculation.  Returns NULL if the specified speculation is not
3174  * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
3175  * the active CPU is not the specified CPU -- the speculation will be
3176  * atomically transitioned into the ACTIVEMANY state.
3177  */
3178 static dtrace_buffer_t *
3179 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
3180     dtrace_specid_t which)
3181 {
3182         dtrace_speculation_t *spec;
3183         dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3184         dtrace_buffer_t *buf;
3185
3186         if (which == 0)
3187                 return (NULL);
3188
3189         if (which > (dtrace_specid_t)state->dts_nspeculations) {
3190                 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3191                 return (NULL);
3192         }
3193
3194         spec = &state->dts_speculations[which - 1];
3195         buf = &spec->dtsp_buffer[cpuid];
3196
3197         do {
3198                 current = spec->dtsp_state;
3199
3200                 switch (current) {
3201                 case DTRACESPEC_INACTIVE:
3202                 case DTRACESPEC_COMMITTINGMANY:
3203                 case DTRACESPEC_DISCARDING:
3204                         return (NULL);
3205
3206                 case DTRACESPEC_COMMITTING:
3207                         ASSERT(buf->dtb_offset == 0);
3208                         return (NULL);
3209
3210                 case DTRACESPEC_ACTIVEONE:
3211                         /*
3212                          * This speculation is currently active on one CPU.
3213                          * Check the offset in the buffer; if it's non-zero,
3214                          * that CPU must be us (and we leave the state alone).
3215                          * If it's zero, assume that we're starting on a new
3216                          * CPU -- and change the state to indicate that the
3217                          * speculation is active on more than one CPU.
3218                          */
3219                         if (buf->dtb_offset != 0)
3220                                 return (buf);
3221
3222                         new = DTRACESPEC_ACTIVEMANY;
3223                         break;
3224
3225                 case DTRACESPEC_ACTIVEMANY:
3226                         return (buf);
3227
3228                 case DTRACESPEC_ACTIVE:
3229                         new = DTRACESPEC_ACTIVEONE;
3230                         break;
3231
3232                 default:
3233                         ASSERT(0);
3234                 }
3235         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3236             current, new) != current);
3237
3238         ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3239         return (buf);
3240 }
3241
3242 /*
3243  * Return a string.  In the event that the user lacks the privilege to access
3244  * arbitrary kernel memory, we copy the string out to scratch memory so that we
3245  * don't fail access checking.
3246  *
3247  * dtrace_dif_variable() uses this routine as a helper for various
3248  * builtin values such as 'execname' and 'probefunc.'
3249  */
3250 static
3251 uintptr_t
3252 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3253     dtrace_mstate_t *mstate)
3254 {
3255         uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3256         uintptr_t ret;
3257         size_t strsz;
3258
3259         /*
3260          * The easy case: this probe is allowed to read all of memory, so
3261          * we can just return this as a vanilla pointer.
3262          */
3263         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3264                 return (addr);
3265
3266         /*
3267          * This is the tougher case: we copy the string in question from
3268          * kernel memory into scratch memory and return it that way: this
3269          * ensures that we won't trip up when access checking tests the
3270          * BYREF return value.
3271          */
3272         strsz = dtrace_strlen((char *)addr, size) + 1;
3273
3274         if (mstate->dtms_scratch_ptr + strsz >
3275             mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3276                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3277                 return (0);
3278         }
3279
3280         dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3281             strsz);
3282         ret = mstate->dtms_scratch_ptr;
3283         mstate->dtms_scratch_ptr += strsz;
3284         return (ret);
3285 }
3286
3287 /*
3288  * This function implements the DIF emulator's variable lookups.  The emulator
3289  * passes a reserved variable identifier and optional built-in array index.
3290  */
3291 static uint64_t
3292 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3293     uint64_t ndx)
3294 {
3295         /*
3296          * If we're accessing one of the uncached arguments, we'll turn this
3297          * into a reference in the args array.
3298          */
3299         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3300                 ndx = v - DIF_VAR_ARG0;
3301                 v = DIF_VAR_ARGS;
3302         }
3303
3304         switch (v) {
3305         case DIF_VAR_ARGS:
3306                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3307                 if (ndx >= sizeof (mstate->dtms_arg) /
3308                     sizeof (mstate->dtms_arg[0])) {
3309                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3310                         dtrace_vstate_t *vstate = &state->dts_vstate;
3311                         dtrace_provider_t *pv;
3312                         uint64_t val;
3313
3314                         pv = mstate->dtms_probe->dtpr_provider;
3315                         if (pv->dtpv_pops.dtps_getargval != NULL)
3316                                 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3317                                     mstate->dtms_probe->dtpr_id,
3318                                     mstate->dtms_probe->dtpr_arg, ndx, aframes);
3319                         /* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
3320                         else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && ndx == 5) {
3321                                 return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
3322                         }
3323
3324                         else
3325                                 val = dtrace_getarg(ndx, aframes, mstate, vstate);
3326
3327                         /*
3328                          * This is regrettably required to keep the compiler
3329                          * from tail-optimizing the call to dtrace_getarg().
3330                          * The condition always evaluates to true, but the
3331                          * compiler has no way of figuring that out a priori.
3332                          * (None of this would be necessary if the compiler
3333                          * could be relied upon to _always_ tail-optimize
3334                          * the call to dtrace_getarg() -- but it can't.)
3335                          */
3336                         if (mstate->dtms_probe != NULL)
3337                                 return (val);
3338
3339                         ASSERT(0);
3340                 }
3341
3342                 return (mstate->dtms_arg[ndx]);
3343
3344         case DIF_VAR_UREGS: {
3345                 thread_t thread;
3346
3347                 if (!dtrace_priv_proc(state))
3348                         return (0);
3349
3350                 if ((thread = current_thread()) == NULL) {
3351                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3352                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3353                         return (0);
3354                 }
3355
3356                 return (dtrace_getreg(find_user_regs(thread), ndx));
3357         }
3358
3359
3360         case DIF_VAR_CURTHREAD:
3361                 if (!dtrace_priv_kernel(state))
3362                         return (0);
3363
3364                 return ((uint64_t)(uintptr_t)current_thread());
3365
3366         case DIF_VAR_TIMESTAMP:
3367                 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3368                         mstate->dtms_timestamp = dtrace_gethrtime();
3369                         mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3370                 }
3371                 return (mstate->dtms_timestamp);
3372
3373         case DIF_VAR_VTIMESTAMP:
3374                 ASSERT(dtrace_vtime_references != 0);
3375                 return (dtrace_get_thread_vtime(current_thread()));
3376
3377         case DIF_VAR_WALLTIMESTAMP:
3378                 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3379                         mstate->dtms_walltimestamp = dtrace_gethrestime();
3380                         mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3381                 }
3382                 return (mstate->dtms_walltimestamp);
3383
3384         case DIF_VAR_MACHTIMESTAMP:
3385                 if (!(mstate->dtms_present & DTRACE_MSTATE_MACHTIMESTAMP)) {
3386                         mstate->dtms_machtimestamp = mach_absolute_time();
3387                         mstate->dtms_present |= DTRACE_MSTATE_MACHTIMESTAMP;
3388                 }
3389                 return (mstate->dtms_machtimestamp);
3390
3391         case DIF_VAR_CPU:
3392                 return ((uint64_t) dtrace_get_thread_last_cpu_id(current_thread()));
3393
3394         case DIF_VAR_IPL:
3395                 if (!dtrace_priv_kernel(state))
3396                         return (0);
3397                 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3398                         mstate->dtms_ipl = dtrace_getipl();
3399                         mstate->dtms_present |= DTRACE_MSTATE_IPL;
3400                 }
3401                 return (mstate->dtms_ipl);
3402
3403         case DIF_VAR_EPID:
3404                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3405                 return (mstate->dtms_epid);
3406
3407         case DIF_VAR_ID:
3408                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3409                 return (mstate->dtms_probe->dtpr_id);
3410
3411         case DIF_VAR_STACKDEPTH:
3412                 if (!dtrace_priv_kernel(state))
3413                         return (0);
3414                 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3415                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3416
3417                         mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3418                         mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3419                 }
3420                 return (mstate->dtms_stackdepth);
3421
3422         case DIF_VAR_USTACKDEPTH:
3423                 if (!dtrace_priv_proc(state))
3424                         return (0);
3425                 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3426                         /*
3427                          * See comment in DIF_VAR_PID.
3428                          */
3429                         if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3430                             CPU_ON_INTR(CPU)) {
3431                                 mstate->dtms_ustackdepth = 0;
3432                         } else {
3433                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3434                                 mstate->dtms_ustackdepth =
3435                                     dtrace_getustackdepth();
3436                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3437                         }
3438                         mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3439                 }
3440                 return (mstate->dtms_ustackdepth);
3441
3442         case DIF_VAR_CALLER:
3443                 if (!dtrace_priv_kernel(state))
3444                         return (0);
3445                 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3446                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3447
3448                         if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3449                                 /*
3450                                  * If this is an unanchored probe, we are
3451                                  * required to go through the slow path:
3452                                  * dtrace_caller() only guarantees correct
3453                                  * results for anchored probes.
3454                                  */
3455                                 pc_t caller[2];
3456
3457                                 dtrace_getpcstack(caller, 2, aframes,
3458                                     (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3459                                 mstate->dtms_caller = caller[1];
3460                         } else if ((mstate->dtms_caller =
3461                                 dtrace_caller(aframes)) == (uintptr_t)-1) {
3462                                 /*
3463                                  * We have failed to do this the quick way;
3464                                  * we must resort to the slower approach of
3465                                  * calling dtrace_getpcstack().
3466                                  */
3467                                 pc_t caller;
3468
3469                                 dtrace_getpcstack(&caller, 1, aframes, NULL);
3470                                 mstate->dtms_caller = caller;
3471                         }
3472
3473                         mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3474                 }
3475                 return (mstate->dtms_caller);
3476
3477         case DIF_VAR_UCALLER:
3478                 if (!dtrace_priv_proc(state))
3479                         return (0);
3480
3481                 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3482                         uint64_t ustack[3];
3483
3484                         /*
3485                          * dtrace_getupcstack() fills in the first uint64_t
3486                          * with the current PID.  The second uint64_t will
3487                          * be the program counter at user-level.  The third
3488                          * uint64_t will contain the caller, which is what
3489                          * we're after.
3490                          */
3491                         ustack[2] = 0;
3492                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3493                         dtrace_getupcstack(ustack, 3);
3494                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3495                         mstate->dtms_ucaller = ustack[2];
3496                         mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3497                 }
3498
3499                 return (mstate->dtms_ucaller);
3500
3501         case DIF_VAR_PROBEPROV:
3502                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3503                 return (dtrace_dif_varstr(
3504                     (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3505                     state, mstate));
3506
3507         case DIF_VAR_PROBEMOD:
3508                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3509                 return (dtrace_dif_varstr(
3510                     (uintptr_t)mstate->dtms_probe->dtpr_mod,
3511                     state, mstate));
3512
3513         case DIF_VAR_PROBEFUNC:
3514                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3515                 return (dtrace_dif_varstr(
3516                     (uintptr_t)mstate->dtms_probe->dtpr_func,
3517                     state, mstate));
3518
3519         case DIF_VAR_PROBENAME:
3520                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3521                 return (dtrace_dif_varstr(
3522                     (uintptr_t)mstate->dtms_probe->dtpr_name,
3523                     state, mstate));
3524
3525         case DIF_VAR_PID:
3526                 if (!dtrace_priv_proc_relaxed(state))
3527                         return (0);
3528
3529                 /*
3530                  * Note that we are assuming that an unanchored probe is
3531                  * always due to a high-level interrupt.  (And we're assuming
3532                  * that there is only a single high level interrupt.)
3533                  */
3534                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3535                         /* Anchored probe that fires while on an interrupt accrues to process 0 */
3536                         return 0;
3537
3538                 return ((uint64_t)dtrace_proc_selfpid());
3539
3540         case DIF_VAR_PPID:
3541                 if (!dtrace_priv_proc_relaxed(state))
3542                         return (0);
3543
3544                 /*
3545                  * See comment in DIF_VAR_PID.
3546                  */
3547                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3548                         return (0);
3549
3550                 return ((uint64_t)dtrace_proc_selfppid());
3551
3552         case DIF_VAR_TID:
3553                 /* We do not need to check for null current_thread() */
3554                 return thread_tid(current_thread()); /* globally unique */
3555
3556         case DIF_VAR_PTHREAD_SELF:
3557                 if (!dtrace_priv_proc(state))
3558                         return (0);
3559
3560                 /* Not currently supported, but we should be able to delta the dispatchqaddr and dispatchqoffset to get pthread_self */
3561                 return 0;
3562
3563         case DIF_VAR_DISPATCHQADDR:
3564                 if (!dtrace_priv_proc(state))
3565                         return (0);
3566
3567                 /* We do not need to check for null current_thread() */
3568                 return thread_dispatchqaddr(current_thread());
3569
3570         case DIF_VAR_EXECNAME:
3571         {
3572                 char *xname = (char *)mstate->dtms_scratch_ptr;
3573                 size_t scratch_size = MAXCOMLEN+1;
3574
3575                 /* The scratch allocation's lifetime is that of the clause. */
3576                 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3577                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3578                         return 0;
3579                 }
3580
3581                 if (!dtrace_priv_proc_relaxed(state))
3582                         return (0);
3583
3584                 mstate->dtms_scratch_ptr += scratch_size;
3585                 proc_selfname( xname, scratch_size );
3586
3587                 return ((uint64_t)(uintptr_t)xname);
3588         }
3589
3590
3591         case DIF_VAR_ZONENAME:
3592         {
3593                 /* scratch_size is equal to length('global') + 1 for the null-terminator. */
3594                 char *zname = (char *)mstate->dtms_scratch_ptr;
3595                 size_t scratch_size = 6 + 1;
3596
3597                 if (!dtrace_priv_proc(state))
3598                         return (0);
3599
3600                 /* The scratch allocation's lifetime is that of the clause. */
3601                 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3602                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3603                         return 0;
3604                 }
3605
3606                 mstate->dtms_scratch_ptr += scratch_size;
3607
3608                 /* The kernel does not provide zonename, it will always return 'global'. */
3609                 strlcpy(zname, "global", scratch_size);
3610
3611                 return ((uint64_t)(uintptr_t)zname);
3612         }
3613
3614 #if MONOTONIC
3615         case DIF_VAR_CPUINSTRS:
3616                 return mt_cur_cpu_instrs();
3617
3618         case DIF_VAR_CPUCYCLES:
3619                 return mt_cur_cpu_cycles();
3620
3621         case DIF_VAR_VINSTRS:
3622                 return mt_cur_thread_instrs();
3623
3624         case DIF_VAR_VCYCLES:
3625                 return mt_cur_thread_cycles();
3626 #else /* MONOTONIC */
3627         case DIF_VAR_CPUINSTRS: /* FALLTHROUGH */
3628         case DIF_VAR_CPUCYCLES: /* FALLTHROUGH */
3629         case DIF_VAR_VINSTRS: /* FALLTHROUGH */
3630         case DIF_VAR_VCYCLES: /* FALLTHROUGH */
3631                 return 0;
3632 #endif /* !MONOTONIC */
3633
3634         case DIF_VAR_UID:
3635                 if (!dtrace_priv_proc_relaxed(state))
3636                         return (0);
3637
3638                 /*
3639                  * See comment in DIF_VAR_PID.
3640                  */
3641                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3642                         return (0);
3643
3644                 return ((uint64_t) dtrace_proc_selfruid());
3645
3646         case DIF_VAR_GID:
3647                 if (!dtrace_priv_proc(state))
3648                         return (0);
3649
3650                 /*
3651                  * See comment in DIF_VAR_PID.
3652                  */
3653                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3654                         return (0);
3655
3656                 if (dtrace_CRED() != NULL)
3657                         /* Credential does not require lazy initialization. */
3658                         return ((uint64_t)kauth_getgid());
3659                 else {
3660                         /* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
3661                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3662                         return -1ULL;
3663                 }
3664
3665         case DIF_VAR_ERRNO: {
3666                 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
3667                 if (!dtrace_priv_proc(state))
3668                         return (0);
3669
3670                 /*
3671                  * See comment in DIF_VAR_PID.
3672                  */
3673                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3674                         return (0);
3675
3676                 if (uthread)
3677                         return (uint64_t)uthread->t_dtrace_errno;
3678                 else {
3679                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3680                         return -1ULL;
3681                 }
3682         }
3683
3684         default:
3685                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3686                 return (0);
3687         }
3688 }
3689
/*
 * States of the character-at-a-time state machine in dtrace_json().
 * Numbering starts at 1.
 */
typedef enum dtrace_json_state {
	DTRACE_JSON_REST                  = 1,	/* before the top-level value */
	DTRACE_JSON_OBJECT                = 2,	/* looking for the next key */
	DTRACE_JSON_STRING                = 3,	/* inside a String */
	DTRACE_JSON_STRING_ESCAPE         = 4,	/* after a backslash */
	DTRACE_JSON_STRING_ESCAPE_UNICODE = 5,	/* inside a unicode escape */
	DTRACE_JSON_COLON                 = 6,	/* looking for ':' after key */
	DTRACE_JSON_COMMA                 = 7,	/* looking for ',' after value */
	DTRACE_JSON_VALUE                 = 8,	/* detecting a value's type */
	DTRACE_JSON_IDENTIFIER            = 9,	/* true, false or null */
	DTRACE_JSON_NUMBER                = 10,	/* Number literal */
	DTRACE_JSON_NUMBER_FRAC           = 11,	/* Number, fraction part */
	DTRACE_JSON_NUMBER_EXP            = 12,	/* Number, exponent part */
	DTRACE_JSON_COLLECT_OBJECT        = 13	/* collecting Object/Array */
} dtrace_json_state_t;
3705
3706 /*
3707  * This function possesses just enough knowledge about JSON to extract a single
3708  * value from a JSON string and store it in the scratch buffer.  It is able
3709  * to extract nested object values, and members of arrays by index.
3710  *
3711  * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3712  * be looked up as we descend into the object tree.  e.g.
3713  *
3714  *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3715  *       with nelems = 5.
3716  *
3717  * The run time of this function must be bounded above by strsize to limit the
3718  * amount of work done in probe context.  As such, it is implemented as a
3719  * simple state machine, reading one character at a time using safe loads
3720  * until we find the requested element, hit a parsing error or run off the
3721  * end of the object or string.
3722  *
3723  * As there is no way for a subroutine to return an error without interrupting
3724  * clause execution, we simply return NULL in the event of a missing key or any
3725  * other error condition.  Each NULL return in this function is commented with
3726  * the error condition it represents -- parsing or otherwise.
3727  *
3728  * The set of states for the state machine closely matches the JSON
3729  * specification (http://json.org/).  Briefly:
3730  *
3731  *   DTRACE_JSON_REST:
3732  *     Skip whitespace until we find either a top-level Object, moving
3733  *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3734  *
3735  *   DTRACE_JSON_OBJECT:
3736  *     Locate the next key String in an Object.  Sets a flag to denote
3737  *     the next String as a key string and moves to DTRACE_JSON_STRING.
3738  *
3739  *   DTRACE_JSON_COLON:
3740  *     Skip whitespace until we find the colon that separates key Strings
3741  *     from their values.  Once found, move to DTRACE_JSON_VALUE.
3742  *
3743  *   DTRACE_JSON_VALUE:
3744  *     Detects the type of the next value (String, Number, Identifier, Object
3745  *     or Array) and routes to the states that process that type.  Here we also
3746  *     deal with the element selector list if we are requested to traverse down
3747  *     into the object tree.
3748  *
3749  *   DTRACE_JSON_COMMA:
3750  *     Skip whitespace until we find the comma that separates key-value pairs
3751  *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3752  *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
3753  *     states return to this state at the end of their value, unless otherwise
3754  *     noted.
3755  *
3756  *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3757  *     Processes a Number literal from the JSON, including any exponent
3758  *     component that may be present.  Numbers are returned as strings, which
3759  *     may be passed to strtoll() if an integer is required.
3760  *
3761  *   DTRACE_JSON_IDENTIFIER:
3762  *     Processes a "true", "false" or "null" literal in the JSON.
3763  *
3764  *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3765  *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
3766  *     Processes a String literal from the JSON, whether the String denotes
3767  *     a key, a value or part of a larger Object.  Handles all escape sequences
3768  *     present in the specification, including four-digit unicode characters,
3769  *     but merely includes the escape sequence without converting it to the
3770  *     actual escaped character.  If the String is flagged as a key, we
3771  *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3772  *
3773  *   DTRACE_JSON_COLLECT_OBJECT:
3774  *     This state collects an entire Object (or Array), correctly handling
3775  *     embedded strings.  If the full element selector list matches this nested
3776  *     object, we return the Object in full as a string.  If not, we use this
3777  *     state to skip to the next value at this level and continue processing.
3778  */
3779 static char *
3780 dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3781     char *dest)
3782 {
             /*
              * Scan at most `size' bytes of JSON text starting at address `json'
              * and extract the value named by the element selector list.
              *
              *   size     - upper bound on the bytes read from `json'; it is
              *              also passed as the limit argument to
              *              dtrace_strtoll() and dtrace_strncmp() below.
              *   json     - address of the input, read one byte at a time
              *              with dtrace_load8().
              *   elemlist - selector elements packed back to back, each one
              *              terminated by a NUL.
              *   nelems   - number of selector elements left in `elemlist';
              *              decremented each time we descend a level.
              *   dest     - buffer used both to assemble each token and to
              *              hold the result.  No bound on `dest' is checked
              *              here; presumably the caller sizes it to hold the
              *              whole input plus a terminating NUL -- TODO confirm
              *              against the caller.
              *
              * Returns `dest' holding the NUL-terminated value on success.
              * Returns NULL on a parse error, on reading a NUL byte, on running
              * out of input, or when the selector does not match.
              */
3783         dtrace_json_state_t state = DTRACE_JSON_REST;
3784         int64_t array_elem = INT64_MIN;
3785         int64_t array_pos = 0;
3786         uint8_t escape_unicount = 0;
3787         boolean_t string_is_key = B_FALSE;
3788         boolean_t collect_object = B_FALSE;
3789         boolean_t found_key = B_FALSE;
3790         boolean_t in_array = B_FALSE;
3791         uint32_t braces = 0, brackets = 0;
3792         char *elem = elemlist;
3793         char *dd = dest;
3794         uintptr_t cur;
             /*
              * array_elem is the array index requested by the current selector
              * element and array_pos is the index of the array value being
              * scanned.  array_pos only ever counts up from zero, so the
              * initial INT64_MIN in array_elem cannot match it.  elem points at
              * the current selector element and dd is the write cursor into
              * dest.
              */
3795
3796         for (cur = json; cur < json + size; cur++) {
3797                 char cc = dtrace_load8(cur);
                     /* A NUL here means the input ended before a value was returned. */
3798                 if (cc == '\0')
3799                         return (NULL);
3800
3801                 switch (state) {
3802                 case DTRACE_JSON_REST:
3803                         if (isspace(cc))
3804                                 break;
3805
3806                         if (cc == '{') {
3807                                 state = DTRACE_JSON_OBJECT;
3808                                 break;
3809                         }
3810
3811                         if (cc == '[') {
                                     /*
                                      * The current selector element is taken as
                                      * an array index (the 10 passed to
                                      * dtrace_strtoll() is presumably the base).
                                      * An index of zero matches the first value
                                      * straight away.
                                      */
3812                                 in_array = B_TRUE;
3813                                 array_pos = 0;
3814                                 array_elem = dtrace_strtoll(elem, 10, size);
3815                                 found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3816                                 state = DTRACE_JSON_VALUE;
3817                                 break;
3818                         }
3819
3820                         /*
3821                          * ERROR: expected to find a top-level object or array.
3822                          */
3823                         return (NULL);
3824                 case DTRACE_JSON_OBJECT:
3825                         if (isspace(cc))
3826                                 break;
3827
                             /* Inside an object the next token must be a key string. */
3828                         if (cc == '"') {
3829                                 state = DTRACE_JSON_STRING;
3830                                 string_is_key = B_TRUE;
3831                                 break;
3832                         }
3833
3834                         /*
3835                          * ERROR: either the object did not start with a key
3836                          * string, or we've run off the end of the object
3837                          * without finding the requested key.
3838                          */
3839                         return (NULL);
3840                 case DTRACE_JSON_STRING:
3841                         if (cc == '\\') {
                                     /*
                                      * The backslash itself is stored: escape
                                      * sequences are copied through as written,
                                      * not decoded.
                                      */
3842                                 *dd++ = '\\';
3843                                 state = DTRACE_JSON_STRING_ESCAPE;
3844                                 break;
3845                         }
3846
3847                         if (cc == '"') {
3848                                 if (collect_object) {
3849                                         /*
3850                                          * We don't reset the dest here, as
3851                                          * the string is part of a larger
3852                                          * object being collected.
3853                                          */
3854                                         *dd++ = cc;
3855                                         collect_object = B_FALSE;
3856                                         state = DTRACE_JSON_COLLECT_OBJECT;
3857                                         break;
3858                                 }
3859                                 *dd = '\0';
3860                                 dd = dest; /* reset string buffer */
3861                                 if (string_is_key) {
                                             /*
                                              * dest now holds the key text
                                              * (the quotes were never stored);
                                              * compare it with the current
                                              * selector element.
                                              */
3862                                         if (dtrace_strncmp(dest, elem,
3863                                             size) == 0)
3864                                                 found_key = B_TRUE;
3865                                 } else if (found_key) {
3866                                         if (nelems > 1) {
3867                                                 /*
3868                                                  * We expected an object, not
3869                                                  * this string.
3870                                                  */
3871                                                 return (NULL);
3872                                         }
                                             /* This string is the selected value. */
3873                                         return (dest);
3874                                 }
3875                                 state = string_is_key ? DTRACE_JSON_COLON :
3876                                     DTRACE_JSON_COMMA;
3877                                 string_is_key = B_FALSE;
3878                                 break;
3879                         }
3880
3881                         *dd++ = cc;
3882                         break;
3883                 case DTRACE_JSON_STRING_ESCAPE:
                             /*
                              * Store the character following the backslash.  Only
                              * 'u' needs further handling: it is followed by four
                              * hexadecimal digits.
                              */
3884                         *dd++ = cc;
3885                         if (cc == 'u') {
3886                                 escape_unicount = 0;
3887                                 state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3888                         } else {
3889                                 state = DTRACE_JSON_STRING;
3890                         }
3891                         break;
3892                 case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3893                         if (!isxdigit(cc)) {
3894                                 /*
3895                                  * ERROR: invalid unicode escape, expected
3896                                  * four valid hexadecimal digits.
3897                                  */
3898                                 return (NULL);
3899                         }
3900
3901                         *dd++ = cc;
3902                         if (++escape_unicount == 4)
3903                                 state = DTRACE_JSON_STRING;
3904                         break;
3905                 case DTRACE_JSON_COLON:
3906                         if (isspace(cc))
3907                                 break;
3908
3909                         if (cc == ':') {
3910                                 state = DTRACE_JSON_VALUE;
3911                                 break;
3912                         }
3913
3914                         /*
3915                          * ERROR: expected a colon.
3916                          */
3917                         return (NULL);
3918                 case DTRACE_JSON_COMMA:
3919                         if (isspace(cc))
3920                                 break;
3921
3922                         if (cc == ',') {
3923                                 if (in_array) {
                                             /*
                                              * Advance to the next array slot
                                              * and check whether it is the one
                                              * the selector asked for.
                                              */
3924                                         state = DTRACE_JSON_VALUE;
3925                                         if (++array_pos == array_elem)
3926                                                 found_key = B_TRUE;
3927                                 } else {
3928                                         state = DTRACE_JSON_OBJECT;
3929                                 }
3930                                 break;
3931                         }
3932
3933                         /*
3934                          * ERROR: either we hit an unexpected character, or
3935                          * we reached the end of the object or array without
3936                          * finding the requested key.
3937                          */
3938                         return (NULL);
3939                 case DTRACE_JSON_IDENTIFIER:
3940                         if (islower(cc)) {
3941                                 *dd++ = cc;
3942                                 break;
3943                         }
3944
                             /* Any non-lowercase character ends the identifier. */
3945                         *dd = '\0';
3946                         dd = dest; /* reset string buffer */
3947
3948                         if (dtrace_strncmp(dest, "true", 5) == 0 ||
3949                             dtrace_strncmp(dest, "false", 6) == 0 ||
3950                             dtrace_strncmp(dest, "null", 5) == 0) {
3951                                 if (found_key) {
3952                                         if (nelems > 1) {
3953                                                 /*
3954                                                  * ERROR: We expected an object,
3955                                                  * not this identifier.
3956                                                  */
3957                                                 return (NULL);
3958                                         }
3959                                         return (dest);
3960                                 } else {
                                             /*
                                              * The terminating character has
                                              * not been handled yet.  Step back
                                              * so that the loop's cur++ reads
                                              * it again in the COMMA state.
                                              */
3961                                         cur--;
3962                                         state = DTRACE_JSON_COMMA;
3963                                         break;
3964                                 }
3965                         }
3966
3967                         /*
3968                          * ERROR: we did not recognise the identifier as one
3969                          * of those in the JSON specification.
3970                          */
3971                         return (NULL);
3972                 case DTRACE_JSON_NUMBER:
                             /*
                              * Integer part.  Characters not handled here fall
                              * through to the checks shared with the fraction and
                              * exponent states.
                              */
3973                         if (cc == '.') {
3974                                 *dd++ = cc;
3975                                 state = DTRACE_JSON_NUMBER_FRAC;
3976                                 break;
3977                         }
3978
3979                         if (cc == 'x' || cc == 'X') {
3980                                 /*
3981                                  * ERROR: specification explicitly excludes
3982                                  * hexadecimal or octal numbers.
3983                                  */
3984                                 return (NULL);
3985                         }
3986
3987                         /* FALLTHRU */
3988                 case DTRACE_JSON_NUMBER_FRAC:
                             /*
                              * Entered directly while in the fraction state, or by
                              * falling through from the integer state above.
                              */
3989                         if (cc == 'e' || cc == 'E') {
3990                                 *dd++ = cc;
3991                                 state = DTRACE_JSON_NUMBER_EXP;
3992                                 break;
3993                         }
3994
3995                         if (cc == '+' || cc == '-') {
3996                                 /*
3997                                  * ERROR: expect sign as part of exponent only.
3998                                  */
3999                                 return (NULL);
4000                         }
4001                         /* FALLTHRU */
4002                 case DTRACE_JSON_NUMBER_EXP:
                             /*
                              * The two states above have already rejected '+' and
                              * '-', so a sign is only accepted here while in the
                              * exponent state.  Where the sign appears within the
                              * exponent is not checked.
                              */
4003                         if (isdigit(cc) || cc == '+' || cc == '-') {
4004                                 *dd++ = cc;
4005                                 break;
4006                         }
4007
                             /* Any other character terminates the number. */
4008                         *dd = '\0';
4009                         dd = dest; /* reset string buffer */
4010                         if (found_key) {
4011                                 if (nelems > 1) {
4012                                         /*
4013                                          * ERROR: We expected an object, not
4014                                          * this number.
4015                                          */
4016                                         return (NULL);
4017                                 }
4018                                 return (dest);
4019                         }
4020
                             /*
                              * The terminating character has not been handled yet.
                              * Step back so that the loop's cur++ reads it again
                              * in the COMMA state.
                              */
4021                         cur--;
4022                         state = DTRACE_JSON_COMMA;
4023                         break;
4024                 case DTRACE_JSON_VALUE:
4025                         if (isspace(cc))
4026                                 break;
4027
4028                         if (cc == '{' || cc == '[') {
4029                                 if (nelems > 1 && found_key) {
4030                                         in_array = cc == '[' ? B_TRUE : B_FALSE;
4031                                         /*
4032                                          * If our element selector directs us
4033                                          * to descend into this nested object,
4034                                          * then move to the next selector
4035                                          * element in the list and restart the
4036                                          * state machine.
4037                                          */
4038                                         while (*elem != '\0')
4039                                                 elem++;
4040                                         elem++; /* skip the inter-element NUL */
4041                                         nelems--;
4042                                         dd = dest;
4043                                         if (in_array) {
4044                                                 state = DTRACE_JSON_VALUE;
4045                                                 array_pos = 0;
4046                                                 array_elem = dtrace_strtoll(
4047                                                     elem, 10, size);
4048                                                 found_key = array_elem == 0 ?
4049                                                     B_TRUE : B_FALSE;
4050                                         } else {
4051                                                 found_key = B_FALSE;
4052                                                 state = DTRACE_JSON_OBJECT;
4053                                         }
4054                                         break;
4055                                 }
4056
4057                                 /*
4058                                  * Otherwise, we wish to either skip this
4059                                  * nested object or return it in full.
4060                                  */
                                     /*
                                      * Start the nesting count for the opening
                                      * character; it is also stored, because the
                                      * collected text may be returned verbatim.
                                      */
4061                                 if (cc == '[')
4062                                         brackets = 1;
4063                                 else
4064                                         braces = 1;
4065                                 *dd++ = cc;
4066                                 state = DTRACE_JSON_COLLECT_OBJECT;
4067                                 break;
4068                         }
4069
4070                         if (cc == '"') {
4071                                 state = DTRACE_JSON_STRING;
4072                                 break;
4073                         }
4074
4075                         if (islower(cc)) {
4076                                 /*
4077                                  * Here we deal with true, false and null.
4078                                  */
4079                                 *dd++ = cc;
4080                                 state = DTRACE_JSON_IDENTIFIER;
4081                                 break;
4082                         }
4083
4084                         if (cc == '-' || isdigit(cc)) {
4085                                 *dd++ = cc;
4086                                 state = DTRACE_JSON_NUMBER;
4087                                 break;
4088                         }
4089
4090                         /*
4091                          * ERROR: unexpected character at start of value.
4092                          */
4093                         return (NULL);
4094                 case DTRACE_JSON_COLLECT_OBJECT:
                             /*
                              * NOTE(review): the check at the top of the loop
                              * already returns when cc is NUL, so this test does
                              * not appear to be reachable.
                              */
4095                         if (cc == '\0')
4096                                 /*
4097                                  * ERROR: unexpected end of input.
4098                                  */
4099                                 return (NULL);
4100
4101                         *dd++ = cc;
4102                         if (cc == '"') {
                                     /*
                                      * Let the STRING state consume the string so
                                      * that any brackets or braces inside it are
                                      * not counted.  collect_object tells that
                                      * state to come back here at the closing
                                      * quote.
                                      */
4103                                 collect_object = B_TRUE;
4104                                 state = DTRACE_JSON_STRING;
4105                                 break;
4106                         }
4107
                             /*
                              * Brackets and braces are counted separately.  The
                              * post-decrement tests catch a closer that arrives
                              * while its count is already zero.
                              */
4108                         if (cc == ']') {
4109                                 if (brackets-- == 0) {
4110                                         /*
4111                                          * ERROR: unbalanced brackets.
4112                                          */
4113                                         return (NULL);
4114                                 }
4115                         } else if (cc == '}') {
4116                                 if (braces-- == 0) {
4117                                         /*
4118                                          * ERROR: unbalanced braces.
4119                                          */
4120                                         return (NULL);
4121                                 }
4122                         } else if (cc == '{') {
4123                                 braces++;
4124                         } else if (cc == '[') {
4125                                 brackets++;
4126                         }
4127
                             /*
                              * Both counts back at zero: the container that started
                              * the collection has just been closed.
                              */
4128                         if (brackets == 0 && braces == 0) {
4129                                 if (found_key) {
4130                                         *dd = '\0';
4131                                         return (dest);
4132                                 }
4133                                 dd = dest; /* reset string buffer */
4134                                 state = DTRACE_JSON_COMMA;
4135                         }
4136                         break;
4137                 }
4138         }
             /* All `size' bytes were consumed without producing a value. */
4139         return (NULL);
4140 }
4141
4142 /*
4143  * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
4144  * Notice that we don't bother validating the proper number of arguments or
4145  * their types in the tuple stack.  This isn't needed because all argument
4146  * interpretation is safe because of our load safety -- the worst that can
4147  * happen is that a bogus program can obtain bogus results.
4148  */
4149 static void
4150 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4151     dtrace_key_t *tupregs, int nargs,
4152     dtrace_mstate_t *mstate, dtrace_state_t *state)
4153 {
4154         volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4155         volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4156         dtrace_vstate_t *vstate = &state->dts_vstate;
4157
4158 #if !defined(__APPLE__)
4159         union {
4160                 mutex_impl_t mi;
4161                 uint64_t mx;
4162         } m;
4163
4164         union {
4165                 krwlock_t ri;
4166                 uintptr_t rw;
4167         } r;
4168 #else
4169 /* FIXME: awaits lock/mutex work */
4170 #endif /* __APPLE__ */
4171
4172         switch (subr) {
4173         case DIF_SUBR_RAND:
4174                 regs[rd] = dtrace_xoroshiro128_plus_next(
4175                     state->dts_rstate[CPU->cpu_id]);
4176                 break;
4177
4178 #if !defined(__APPLE__)
4179         case DIF_SUBR_MUTEX_OWNED:
4180                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4181                     mstate, vstate)) {
4182                         regs[rd] = 0;
4183                         break;
4184                 }
4185
4186                 m.mx = dtrace_load64(tupregs[0].dttk_value);
4187                 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4188                         regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4189                 else
4190                         regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4191                 break;
4192
4193         case DIF_SUBR_MUTEX_OWNER:
4194                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4195                     mstate, vstate)) {
4196                         regs[rd] = 0;
4197                         break;
4198                 }
4199
4200                 m.mx = dtrace_load64(tupregs[0].dttk_value);
4201                 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4202                     MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4203                         regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4204                 else
4205                         regs[rd] = 0;
4206                 break;
4207
4208         case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4209                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4210                     mstate, vstate)) {
4211                         regs[rd] = 0;
4212                         break;
4213                 }
4214
4215                 m.mx = dtrace_load64(tupregs[0].dttk_value);
4216                 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4217                 break;
4218
4219         case DIF_SUBR_MUTEX_TYPE_SPIN:
4220                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4221                     mstate, vstate)) {
4222                         regs[rd] = 0;
4223                         break;
4224                 }
4225
4226                 m.mx = dtrace_load64(tupregs[0].dttk_value);
4227                 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4228                 break;
4229
4230         case DIF_SUBR_RW_READ_HELD: {
4231                 uintptr_t tmp;
4232
4233                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4234                     mstate, vstate)) {
4235                         regs[rd] = 0;
4236                         break;
4237                 }
4238
4239                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4240                 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4241                 break;
4242         }
4243
4244         case DIF_SUBR_RW_WRITE_HELD:
4245                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4246                     mstate, vstate)) {
4247                         regs[rd] = 0;
4248                         break;
4249                 }
4250
4251                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4252                 regs[rd] = _RW_WRITE_HELD(&r.ri);
4253                 break;
4254
4255         case DIF_SUBR_RW_ISWRITER:
4256                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4257                     mstate, vstate)) {
4258                         regs[rd] = 0;
4259                         break;
4260                 }
4261
4262                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4263                 regs[rd] = _RW_ISWRITER(&r.ri);
4264                 break;
4265 #else
4266 /* FIXME: awaits lock/mutex work */
4267 #endif /* __APPLE__ */
4268
4269         case DIF_SUBR_BCOPY: {
4270                 /*
4271                  * We need to be sure that the destination is in the scratch
4272                  * region -- no other region is allowed.
4273                  */
4274                 uintptr_t src = tupregs[0].dttk_value;
4275                 uintptr_t dest = tupregs[1].dttk_value;
4276                 size_t size = tupregs[2].dttk_value;
4277
4278                 if (!dtrace_inscratch(dest, size, mstate)) {
4279                         *flags |= CPU_DTRACE_BADADDR;
4280                         *illval = regs[rd];
4281                         break;
4282                 }
4283
4284                 if (!dtrace_canload(src, size, mstate, vstate)) {
4285                         regs[rd] = 0;
4286                         break;
4287                 }
4288
4289                 dtrace_bcopy((void *)src, (void *)dest, size);
4290                 break;
4291         }
4292
4293         case DIF_SUBR_ALLOCA:
4294         case DIF_SUBR_COPYIN: {
4295                 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4296                 uint64_t size =
4297                     tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4298                 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4299
4300                 /*
4301                  * Check whether the user can access kernel memory
4302                  */
4303                 if (dtrace_priv_kernel(state) == 0) {
4304                         DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
4305                         regs[rd] = 0;
4306                         break;
4307                 }
4308                 /*
4309                  * This action doesn't require any credential checks since
4310                  * probes will not activate in user contexts to which the
4311                  * enabling user does not have permissions.
4312                  */
4313
4314                 /*
4315                  * Rounding up the user allocation size could have overflowed
4316                  * a large, bogus allocation (like -1ULL) to 0.
4317                  */
4318                 if (scratch_size < size ||
4319                     !DTRACE_INSCRATCH(mstate, scratch_size)) {
4320                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4321                         regs[rd] = 0;
4322                         break;
4323                 }
4324
4325                 if (subr == DIF_SUBR_COPYIN) {
4326                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4327                         if (dtrace_priv_proc(state))
4328                                 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4329                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4330                 }
4331
4332                 mstate->dtms_scratch_ptr += scratch_size;
4333                 regs[rd] = dest;
4334                 break;
4335         }
4336
4337         case DIF_SUBR_COPYINTO: {
4338                 uint64_t size = tupregs[1].dttk_value;
4339                 uintptr_t dest = tupregs[2].dttk_value;
4340
4341                 /*
4342                  * This action doesn't require any credential checks since
4343                  * probes will not activate in user contexts to which the
4344                  * enabling user does not have permissions.
4345                  */
4346                 if (!dtrace_inscratch(dest, size, mstate)) {
4347                         *flags |= CPU_DTRACE_BADADDR;
4348                         *illval = regs[rd];
4349                         break;
4350                 }
4351
4352                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4353                 if (dtrace_priv_proc(state))
4354                         dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4355                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4356                 break;
4357         }
4358
4359         case DIF_SUBR_COPYINSTR: {
4360                 uintptr_t dest = mstate->dtms_scratch_ptr;
4361                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4362
4363                 if (nargs > 1 && tupregs[1].dttk_value < size)
4364                         size = tupregs[1].dttk_value + 1;
4365
4366                 /*
4367                  * This action doesn't require any credential checks since
4368                  * probes will not activate in user contexts to which the
4369                  * enabling user does not have permissions.
4370                  */
4371                 if (!DTRACE_INSCRATCH(mstate, size)) {
4372                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4373                         regs[rd] = 0;
4374                         break;
4375                 }
4376
4377                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4378                 if (dtrace_priv_proc(state))
4379                         dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4380                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4381
4382                 ((char *)dest)[size - 1] = '\0';
4383                 mstate->dtms_scratch_ptr += size;
4384                 regs[rd] = dest;
4385                 break;
4386         }
4387
4388         case DIF_SUBR_MSGSIZE:
4389         case DIF_SUBR_MSGDSIZE: {
4390                 /* Darwin does not implement SysV streams messages */
4391                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4392                 regs[rd] = 0;
4393                 break;
4394         }
4395
4396         case DIF_SUBR_PROGENYOF: {
4397                 pid_t pid = tupregs[0].dttk_value;
4398                 struct proc *p = current_proc();
4399                 int rval = 0, lim = nprocs;
4400
4401                 while(p && (lim-- > 0)) {
4402                         pid_t ppid;
4403
4404                         ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
4405                         if (*flags & CPU_DTRACE_FAULT)
4406                                 break;
4407
4408                         if (ppid == pid) {
4409                                 rval = 1;
4410                                 break;
4411                         }
4412
4413                         if (ppid == 0)
4414                                 break; /* Can't climb process tree any further. */
4415
4416                         p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
4417                         if (*flags & CPU_DTRACE_FAULT)
4418                                 break;
4419                 }
4420
4421                 regs[rd] = rval;
4422                 break;
4423         }
4424
4425         case DIF_SUBR_SPECULATION:
4426                 regs[rd] = dtrace_speculation(state);
4427                 break;
4428
4429
4430         case DIF_SUBR_COPYOUT: {
4431                 uintptr_t kaddr = tupregs[0].dttk_value;
4432                 user_addr_t uaddr = tupregs[1].dttk_value;
4433                 uint64_t size = tupregs[2].dttk_value;
4434
4435                 if (!dtrace_destructive_disallow &&
4436                     dtrace_priv_proc_control(state) &&
4437                     !dtrace_istoxic(kaddr, size) &&
4438                     dtrace_canload(kaddr, size, mstate, vstate)) {
4439                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4440                         dtrace_copyout(kaddr, uaddr, size, flags);
4441                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4442                 }
4443                 break;
4444         }
4445
4446         case DIF_SUBR_COPYOUTSTR: {
4447                 uintptr_t kaddr = tupregs[0].dttk_value;
4448                 user_addr_t uaddr = tupregs[1].dttk_value;
4449                 uint64_t size = tupregs[2].dttk_value;
4450                 size_t lim;
4451
4452                 if (!dtrace_destructive_disallow &&
4453                     dtrace_priv_proc_control(state) &&
4454                     !dtrace_istoxic(kaddr, size) &&
4455                     dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
4456                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4457                         dtrace_copyoutstr(kaddr, uaddr, lim, flags);
4458                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4459                 }
4460                 break;
4461         }
4462
4463         case DIF_SUBR_STRLEN: {
4464                 size_t size = state->dts_options[DTRACEOPT_STRSIZE];
4465                 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4466                 size_t lim;
4467
4468                 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4469                         regs[rd] = 0;
4470                         break;
4471                 }
4472
4473                 regs[rd] = dtrace_strlen((char *)addr, lim);
4474
4475                 break;
4476         }
4477
4478         case DIF_SUBR_STRCHR:
4479         case DIF_SUBR_STRRCHR: {
4480                 /*
4481                  * We're going to iterate over the string looking for the
4482                  * specified character.  We will iterate until we have reached
4483                  * the string length or we have found the character.  If this
4484                  * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4485                  * of the specified character instead of the first.
4486                  */
4487                 uintptr_t addr = tupregs[0].dttk_value;
4488                 uintptr_t addr_limit;
4489                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4490                 size_t lim;
4491                 char c, target = (char)tupregs[1].dttk_value;
4492
4493                 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4494                         regs[rd] = 0;
4495                         break;
4496                 }
4497                 addr_limit = addr + lim;
4498
4499                 for (regs[rd] = 0; addr < addr_limit; addr++) {
4500                         if ((c = dtrace_load8(addr)) == target) {
4501                                 regs[rd] = addr;
4502
4503                                 if (subr == DIF_SUBR_STRCHR)
4504                                         break;
4505                         }
4506
4507                         if (c == '\0')
4508                                 break;
4509                 }
4510
4511                 break;
4512         }
4513
4514         case DIF_SUBR_STRSTR:
4515         case DIF_SUBR_INDEX:
4516         case DIF_SUBR_RINDEX: {
4517                 /*
4518                  * We're going to iterate over the string looking for the
4519                  * specified string.  We will iterate until we have reached
4520                  * the string length or we have found the string.  (Yes, this
4521                  * is done in the most naive way possible -- but considering
4522                  * that the string we're searching for is likely to be
4523                  * relatively short, the complexity of Rabin-Karp or similar
4524                  * hardly seems merited.)
4525                  */
                     /*
                      * arg0 is the string to search and arg1 is the string to look
                      * for; index() and rindex() accept an optional starting
                      * position as arg2.  strstr() yields the address of the match
                      * (0 if there is none); index() and rindex() yield a
                      * zero-based offset (-1 if there is none).  Both lengths are
                      * measured with the strsize option as the bound.
                      */
4526                 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4527                 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4528                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4529                 size_t len = dtrace_strlen(addr, size);
4530                 size_t sublen = dtrace_strlen(substr, size);
4531                 char *limit = addr + len, *orig = addr;
4532                 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4533                 int inc = 1;
4534
4535                 regs[rd] = notfound;
4536
                     /*
                      * A failed access check on either string yields 0 for all
                      * three subroutines, not the -1 "not found" value.
                      */
4537                 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4538                         regs[rd] = 0;
4539                         break;
4540                 }
4541
4542                 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4543                     vstate)) {
4544                         regs[rd] = 0;
4545                         break;
4546                 }
4547
4548                 /*
4549                  * strstr() and index()/rindex() have similar semantics if
4550                  * both strings are the empty string: strstr() returns a
4551                  * pointer to the (empty) string, and index() and rindex()
4552                  * both return index 0 (regardless of any position argument).
4553                  */
4554                 if (sublen == 0 && len == 0) {
4555                         if (subr == DIF_SUBR_STRSTR)
4556                                 regs[rd] = (uintptr_t)addr;
4557                         else
4558                                 regs[rd] = 0;
4559                         break;
4560                 }
4561
4562                 if (subr != DIF_SUBR_STRSTR) {
4563                         if (subr == DIF_SUBR_RINDEX) {
                                     /*
                                      * rindex() walks backward: begin at offset
                                      * len and stop once we have stepped to one
                                      * byte before the start of the string.
                                      */
4564                                 limit = orig - 1;
4565                                 addr += len;
4566                                 inc = -1;
4567                         }
4568
4569                         /*
4570                          * Both index() and rindex() take an optional position
4571                          * argument that denotes the starting position.
4572                          */
4573                         if (nargs == 3) {
4574                                 int64_t pos = (int64_t)tupregs[2].dttk_value;
4575
4576                                 /*
4577                                  * If the position argument to index() is
4578                                  * negative, Perl implicitly clamps it at
4579                                  * zero.  This semantic is a little surprising
4580                                  * given the special meaning of negative
4581                                  * positions to similar Perl functions like
4582                                  * substr(), but it appears to reflect a
4583                                  * notion that index() can start from a
4584                                  * negative index and increment its way up to
4585                                  * the string.  Given this notion, Perl's
4586                                  * rindex() is at least self-consistent in
4587                                  * that it implicitly clamps positions greater
4588                                  * than the string length to be the string
4589                                  * length.  Where Perl completely loses
4590                                  * coherence, however, is when the specified
4591                                  * substring is the empty string ("").  In
4592                                  * this case, even if the position is
4593                                  * negative, rindex() returns 0 -- and even if
4594                                  * the position is greater than the length,
4595                                  * index() returns the string length.  These
4596                                  * semantics violate the notion that index()
4597                                  * should never return a value less than the
4598                                  * specified position and that rindex() should
4599                                  * never return a value greater than the
4600                                  * specified position.  (One assumes that
4601                                  * these semantics are artifacts of Perl's
4602                                  * implementation and not the results of
4603                                  * deliberate design -- it beggars belief that
4604                                  * even Larry Wall could desire such oddness.)
4605                                  * While in the abstract one would wish for
4606                                  * consistent position semantics across
4607                                  * substr(), index() and rindex() -- or at the
4608                                  * very least self-consistent position
4609                                  * semantics for index() and rindex() -- we
4610                                  * instead opt to keep with the extant Perl
4611                                  * semantics, in all their broken glory.  (Do
4612                                  * we have more desire to maintain Perl's
4613                                  * semantics than Perl does?  Probably.)
4614                                  */
4615                                 if (subr == DIF_SUBR_RINDEX) {
4616                                         if (pos < 0) {
4617                                                 if (sublen == 0)
4618                                                         regs[rd] = 0;
4619                                                 break;
4620                                         }
4621
4622                                         if ((size_t)pos > len)
4623                                                 pos = len;
4624                                 } else {
4625                                         if (pos < 0)
4626                                                 pos = 0;
4627
4628                                         if ((size_t)pos >= len) {
4629                                                 if (sublen == 0)
4630                                                         regs[rd] = len;
4631                                                 break;
4632                                         }
4633                                 }
4634
4635                                 addr = orig + pos;
4636                         }
4637                 }
4638
                     /*
                      * Visit each candidate position in the chosen direction,
                      * comparing against substr with a bound of sublen bytes, and
                      * stop at the first position that matches.
                      */
4639                 for (regs[rd] = notfound; addr != limit; addr += inc) {
4640                         if (dtrace_strncmp(addr, substr, sublen) == 0) {
4641                                 if (subr != DIF_SUBR_STRSTR) {
4642                                         /*
4643                                          * As D index() and rindex() are
4644                                          * modeled on Perl (and not on awk),
4645                                          * we return a zero-based (and not a
4646                                          * one-based) index.  (For you Perl
4647                                          * weenies: no, we're not going to add
4648                                          * $[ -- and shouldn't you be at a con
4649                                          * or something?)
4650                                          */
4651                                         regs[rd] = (uintptr_t)(addr - orig);
4652                                         break;
4653                                 }
4654
4655                                 ASSERT(subr == DIF_SUBR_STRSTR);
4656                                 regs[rd] = (uintptr_t)addr;
4657                                 break;
4658                         }
4659                 }
4660
4661                 break;
4662         }
4663
4664         case DIF_SUBR_STRTOK: {
                     /*
                      * strtok(): arg0 is the string to tokenize (or NULL to
                      * continue from the position saved by an earlier call) and
                      * arg1 is the set of delimiter characters.  The next token is
                      * copied into a strsize-byte scratch buffer and its address
                      * returned; 0 is returned once no tokens remain.
                      */
4665                 uintptr_t addr = tupregs[0].dttk_value;
4666                 uintptr_t tokaddr = tupregs[1].dttk_value;
4667                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4668                 uintptr_t limit, toklimit;
4669                 size_t clim;
4670                 char *dest = (char *)mstate->dtms_scratch_ptr;
4671                 uint8_t c='\0', tokmap[32];      /* 256 / 8 */
4672                 uint64_t i = 0;
4673
4674                 /*
4675                  * Check both the token buffer and (later) the input buffer,
4676                  * since both could be non-scratch addresses.
4677                  */
4678                 if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
4679                         regs[rd] = 0;
4680                         break;
4681                 }
4682                 toklimit = tokaddr + clim;
4683
4684                 if (!DTRACE_INSCRATCH(mstate, size)) {
4685                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4686                         regs[rd] = 0;
4687                         break;
4688                 }
4689
4690                 if (addr == 0) {
4691                         /*
4692                          * If the address specified is NULL, we use our saved
4693                          * strtok pointer from the mstate.  Note that this
4694                          * means that the saved strtok pointer is _only_
4695                          * valid within multiple enablings of the same probe --
4696                          * it behaves like an implicit clause-local variable.
4697                          */
4698                         addr = mstate->dtms_strtok;
4699                         limit = mstate->dtms_strtok_limit;
4700                 } else {
4701                         /*
4702                          * If the user-specified address is non-NULL we must
4703                          * access check it.  This is the only time we have
4704                          * a chance to do so, since this address may reside
4705                          * in the string table of this clause-- future calls
4706                          * (when we fetch addr from mstate->dtms_strtok)
4707                          * would fail this access check.
4708                          */
4709                         if (!dtrace_strcanload(addr, size, &clim, mstate,
4710                                 vstate)) {
4711                                 regs[rd] = 0;
4712                                 break;
4713                         }
4714                         limit = addr + clim;
4715                 }
4716
4717                 /*
4718                  * First, zero the token map, and then process the token
4719                  * string -- setting a bit in the map for every character
4720                  * found in the token string.
4721                  */
4722                 for (i = 0; i < (int)sizeof (tokmap); i++)
4723                         tokmap[i] = 0;
4724
4725                 for (; tokaddr < toklimit; tokaddr++) {
4726                         if ((c = dtrace_load8(tokaddr)) == '\0')
4727                                 break;
4728
4729                         ASSERT((c >> 3) < sizeof (tokmap));
4730                         tokmap[c >> 3] |= (1 << (c & 0x7));
4731                 }
4732
4733                 for (; addr < limit; addr++) {
4734                         /*
4735                          * We're looking for a character that is _not_
4736                          * contained in the token string.
4737                          */
4738                         if ((c = dtrace_load8(addr)) == '\0')
4739                                 break;
4740
4741                         if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4742                                 break;
4743                 }
4744
4745                 if (addr >= limit || c == '\0') {
4746                         /*
4747                          * We reached the end of the string without finding
4748                          * any character that was not in the token string.
4749                          * We return NULL in this case, and we set the saved
4750                          * address to NULL as well.
                              *
                              * The end of the string is either the terminating
                              * NUL or the end of the accessible range.  The
                              * range must be tested explicitly:  if the scan
                              * above never ran, c still holds whatever the
                              * delimiter scan last read, and if the scan ran
                              * off the end of the range, c holds a delimiter.
                              * Neither says anything about the input string.
4751                          */
4752                         regs[rd] = 0;
4753                         mstate->dtms_strtok = 0;
4754                         mstate->dtms_strtok_limit = 0;
4755                         break;
4756                 }
4757
4758                 /*
4759                  * From here on, we're copying into the destination string.
                      * The copy stops at a NUL, at a delimiter, at the end of the
                      * accessible range, or when only the terminator's byte is
                      * left in the destination.
4760                  */
4761                 for (i = 0; addr < limit && i < size - 1; addr++) {
4762                         if ((c = dtrace_load8(addr)) == '\0')
4763                                 break;
4764
4765                         if (tokmap[c >> 3] & (1 << (c & 0x7)))
4766                                 break;
4767
4768                         ASSERT(i < size);
4769                         dest[i++] = c;
4770                 }
4771
4772                 ASSERT(i < size);
4773                 dest[i] = '\0';
4774                 regs[rd] = (uintptr_t)dest;
4775                 mstate->dtms_scratch_ptr += size;
                     /* Remember where to resume for a later strtok(NULL, ...). */
4776                 mstate->dtms_strtok = addr;
4777                 mstate->dtms_strtok_limit = limit;
4778                 break;
4779         }
4780
4781         case DIF_SUBR_SUBSTR: {
                     /*
                      * substr(): arg0 is the source string, arg1 the starting
                      * index and the optional arg2 a length.  The selected bytes
                      * are copied into a strsize-byte scratch buffer whose address
                      * is returned.  A negative index counts back from the end of
                      * the string; a negative length leaves that many bytes off
                      * the end.
                      */
4782                 uintptr_t s = tupregs[0].dttk_value;
4783                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4784                 char *d = (char *)mstate->dtms_scratch_ptr;
4785                 int64_t index = (int64_t)tupregs[1].dttk_value;
4786                 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4787                 size_t len = dtrace_strlen((char *)s, size);
4788                 int64_t i = 0;
4789
4790                 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4791                         regs[rd] = 0;
4792                         break;
4793                 }
4794
4795                 if (!DTRACE_INSCRATCH(mstate, size)) {
4796                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4797                         regs[rd] = 0;
4798                         break;
4799                 }
4800
                     /* With no length argument, take as much as will fit. */
4801                 if (nargs <= 2)
4802                         remaining = (int64_t)size;
4803
4804                 if (index < 0) {
4805                         index += len;
4806
                             /*
                              * The start lies before the string but the requested
                              * span reaches into it:  drop the part that falls
                              * before the string and begin at offset 0.
                              */
4807                         if (index < 0 && index + remaining > 0) {
4808                                 remaining += index;
4809                                 index = 0;
4810                         }
4811                 }
4812
4813                 if ((size_t)index >= len || index < 0) {
4814                         remaining = 0;
4815                 } else if (remaining < 0) {
4816                         remaining += len - index;
4817                 } else if ((uint64_t)index + (uint64_t)remaining > size) {
4818                         remaining = size - index;
4819                 }
4820
                     /*
                      * remaining may be as large as size here, so the copy is also
                      * bounded by the destination:  stop while one byte is still
                      * free, so that the terminator stored after the loop always
                      * lands inside the size bytes reserved by the
                      * DTRACE_INSCRATCH() check above.
                      */
4821                 for (i = 0; i < remaining && (uint64_t)i + 1 < size; i++) {
4822                         if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4823                                 break;
4824                 }
4825
4826                 d[i] = '\0';
4827
4828                 mstate->dtms_scratch_ptr += size;
4829                 regs[rd] = (uintptr_t)d;
4830                 break;
4831         }
4832
4833         case DIF_SUBR_GETMAJOR: {
                     /* getmajor(): arg0 is taken as a dev_t; return its major number. */
                     dev_t devnum = (dev_t)tupregs[0].dttk_value;

4834                 regs[rd] = (uintptr_t)major(devnum);
4835                 break;
             }
4836
4837         case DIF_SUBR_GETMINOR: {
                     /* getminor(): arg0 is taken as a dev_t; return its minor number. */
                     dev_t devnum = (dev_t)tupregs[0].dttk_value;

4838                 regs[rd] = (uintptr_t)minor(devnum);
4839                 break;
             }
4840
4841         case DIF_SUBR_DDI_PATHNAME: {
4842                 /*
                      * APPLE NOTE: currently unsupported on Darwin.  The result
                      * is 0 and the call is flagged as an illegal operation.
                      */
4844                 regs[rd] = 0;
4843                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4845                 break;
4846         }
4847
4848         case DIF_SUBR_STRJOIN: {
                     /*
                      * strjoin(): concatenate the strings at arg0 and arg1 into
                      * scratch space and return the address of the result.  If the
                      * result, including its terminator, does not fit in strsize
                      * bytes, CPU_DTRACE_NOSCRATCH is raised and 0 is returned.
                      */
4849                 char *d = (char *)mstate->dtms_scratch_ptr;
4850                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4851                 uintptr_t s1 = tupregs[0].dttk_value;
4852                 uintptr_t s2 = tupregs[1].dttk_value;
4853                 uint64_t i = 0, j = 0;
4854                 size_t lim1, lim2;
4855                 char c;
                     int overflowed = 0;
4856
4857                 if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
4858                     !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
4859                         regs[rd] = 0;
4860                         break;
4861                 }
4862
4863                 if (!DTRACE_INSCRATCH(mstate, size)) {
4864                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4865                         regs[rd] = 0;
4866                         break;
4867                 }
4868
                     /*
                      * Copy the first string.  A read at or past lim1 is treated
                      * as the terminator.  On exit i indexes the terminator, so
                      * that the second string overwrites it.
                      */
4869                 for (;;) {
4870                         if (i >= size) {
4871                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4872                                 regs[rd] = 0;
                                     overflowed = 1;
4873                                 break;
4874                         }
4875                         c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
4876                         if ((d[i++] = c) == '\0') {
4877                                 i--;
4878                                 break;
4879                         }
4880                 }
4881
                     /*
                      * Append the second string, terminator included; j counts
                      * the bytes taken from it so that lim2 can be honored.
                      */
4882                 for (;;) {
4883                         if (i >= size) {
4884                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4885                                 regs[rd] = 0;
                                     overflowed = 1;
4886                                 break;
4887                         }
4888                         c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
4889                         if ((d[i++] = c) == '\0')
4890                                 break;
4891                 }
4892
                     /*
                      * Success is tracked explicitly rather than inferred from
                      * i < size:  a result that fills the buffer exactly leaves
                      * i == size after its terminator is stored, and must still
                      * be returned.
                      */
4893                 if (!overflowed) {
4894                         mstate->dtms_scratch_ptr += i;
4895                         regs[rd] = (uintptr_t)d;
4896                 }
4897
4898                 break;
4899         }
4900
4901         case DIF_SUBR_STRTOLL: {
                     /*
                      * strtoll(): convert the string at arg0 to an integer using
                      * the base given as the optional arg1 (default 10).  A base
                      * outside 2..36 is flagged as an illegal operation; a string
                      * that fails the access check yields INT64_MIN.
                      */
4903                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4902                 uintptr_t s = tupregs[0].dttk_value;
4905                 int base = 10;
4904                 size_t lim;
4906
4907                 if (nargs > 1) {
                             base = tupregs[1].dttk_value;

4908                         if (base <= 1 ||
4909                             base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4910                                 *flags |= CPU_DTRACE_ILLOP;
4911                                 break;
4912                         }
4913                 }
4914
4915                 if (dtrace_strcanload(s, size, &lim, mstate, vstate))
4920                         regs[rd] = dtrace_strtoll((char *)s, base, lim);
                     else
4916                         regs[rd] = INT64_MIN;
4917
4921                 break;
4922         }
4923
4924         case DIF_SUBR_LLTOSTR: {
                     /*
                      * lltostr(): format the integer in arg0 as a string in the
                      * base given as the optional arg1 (default 10) and return its
                      * address.  The digits are generated from least to most
                      * significant, so the string is built backward from the end
                      * of a 65-byte scratch reservation.  Base 16 gets a "0x"
                      * prefix and base 8 a leading "0"; only base 10 is rendered
                      * as signed.
                      */
4925                 int64_t i = (int64_t)tupregs[0].dttk_value;
4926                 uint64_t val, digit;
4927                 uint64_t size = 65;     /* enough room for 2^64 in binary */
4928                 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4929                 int base = 10;
4930
4931                 if (nargs > 1) {
4932                         if ((base = tupregs[1].dttk_value) <= 1 ||
4933                              base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4934                                 *flags |= CPU_DTRACE_ILLOP;
4935                                 break;
4936                         }
4937                 }
4938
                     /*
                      * Take the magnitude for negative base-10 values.  The
                      * negation is done in unsigned arithmetic because negating
                      * INT64_MIN as a signed value overflows.
                      */
4939                 val = (base == 10 && i < 0) ? 0 - (uint64_t)i : (uint64_t)i;
4940
4941                 if (!DTRACE_INSCRATCH(mstate, size)) {
4942                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4943                         regs[rd] = 0;
4944                         break;
4945                 }
4946
                     /* Digits above 9 are rendered as 'a', 'b', ... */
4947                 for (*end-- = '\0'; val; val /= base) {
4948                         if ((digit = val % base) <= '9' - '0') {
4949                                 *end-- = '0' + digit;
4950                         } else {
4951                                 *end-- = 'a' + (digit - ('9' - '0') - 1);
4952                         }
4953                 }
4954
                     /*
                      * The loop above emits nothing for zero, so zero needs an
                      * explicit digit; in base 16 that makes the result "0x0".
                      */
4955                 if (i == 0 && base == 16)
4956                         *end-- = '0';
4957
4958                 if (base == 16)
4959                         *end-- = 'x';
4960
4961                 if (i == 0 || base == 8 || base == 16)
4962                         *end-- = '0';
4963
4964                 if (i < 0 && base == 10)
4965                         *end-- = '-';
4966
                     /* end sits one byte before the first character written. */
4967                 regs[rd] = (uintptr_t)end + 1;
4968                 mstate->dtms_scratch_ptr += size;
4969                 break;
4970         }
4971
4972         case DIF_SUBR_HTONS:
4973         case DIF_SUBR_NTOHS: {
                     /*
                      * htons()/ntohs(): when _BIG_ENDIAN is defined the low 16
                      * bits of arg0 pass through unchanged; otherwise they are
                      * byte-swapped with DT_BSWAP_16().
                      */
                     uint16_t host16 = (uint16_t)tupregs[0].dttk_value;

4974 #ifdef _BIG_ENDIAN
4975                 regs[rd] = host16;
4976 #else
4977                 regs[rd] = DT_BSWAP_16(host16);
4978 #endif
4979                 break;
             }
4980
4981
4982         case DIF_SUBR_HTONL:
4983         case DIF_SUBR_NTOHL: {
                     /*
                      * htonl()/ntohl(): when _BIG_ENDIAN is defined the low 32
                      * bits of arg0 pass through unchanged; otherwise they are
                      * byte-swapped with DT_BSWAP_32().
                      */
                     uint32_t host32 = (uint32_t)tupregs[0].dttk_value;

4984 #ifdef _BIG_ENDIAN
4985                 regs[rd] = host32;
4986 #else
4987                 regs[rd] = DT_BSWAP_32(host32);
4988 #endif
4989                 break;
             }
4990
4991
4992         case DIF_SUBR_HTONLL:
4993         case DIF_SUBR_NTOHLL: {
                     /*
                      * htonll()/ntohll(): when _BIG_ENDIAN is defined the 64-bit
                      * value of arg0 passes through unchanged; otherwise it is
                      * byte-swapped with DT_BSWAP_64().
                      */
                     uint64_t host64 = (uint64_t)tupregs[0].dttk_value;

4994 #ifdef _BIG_ENDIAN
4995                 regs[rd] = host64;
4996 #else
4997                 regs[rd] = DT_BSWAP_64(host64);
4998 #endif
4999                 break;
             }
5000
5001
5002         case DIF_SUBR_DIRNAME:
5003         case DIF_SUBR_BASENAME: {
                     /*
                      * dirname()/basename(): arg0 is a path.  The path is scanned
                      * once from the back to locate the last and first characters
                      * of the final component (lastbase, firstbase) and the last
                      * character of the directory part (lastdir); -1 means "not
                      * found".  The selected range is then copied into a
                      * strsize-byte scratch buffer whose address is returned.
                      */
5004                 char *dest = (char *)mstate->dtms_scratch_ptr;
5005                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5006                 uintptr_t src = tupregs[0].dttk_value;
5007                 int i, j, len = dtrace_strlen((char *)src, size);
5008                 int lastbase = -1, firstbase = -1, lastdir = -1;
5009                 int start, end;
5010
5011                 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5012                         regs[rd] = 0;
5013                         break;
5014                 }
5015
5016                 if (!DTRACE_INSCRATCH(mstate, size)) {
5017                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5018                         regs[rd] = 0;
5019                         break;
5020                 }
5021
5022                 /*
5023                  * The basename and dirname for a zero-length string is
5024                  * defined to be "."
5025                  */
5026                 if (len == 0) {
5027                         len = 1;
5028                         src = (uintptr_t)".";
5029                 }
5030
5031                 /*
5032                  * Start from the back of the string, moving back toward the
5033                  * front until we see a character that isn't a slash.  That
5034                  * character is the last character in the basename.
5035                  */
5036                 for (i = len - 1; i >= 0; i--) {
5037                         if (dtrace_load8(src + i) != '/')
5038                                 break;
5039                 }
5040
5041                 if (i >= 0)
5042                         lastbase = i;
5043
5044                 /*
5045                  * Starting from the last character in the basename, move
5046                  * towards the front until we find a slash.  The character
5047                  * that we processed immediately before that is the first
5048                  * character in the basename.
5049                  */
5050                 for (; i >= 0; i--) {
5051                         if (dtrace_load8(src + i) == '/')
5052                                 break;
5053                 }
5054
5055                 if (i >= 0)
5056                         firstbase = i + 1;
5057
5058                 /*
5059                  * Now keep going until we find a non-slash character.  That
5060                  * character is the last character in the dirname.
5061                  */
5062                 for (; i >= 0; i--) {
5063                         if (dtrace_load8(src + i) != '/')
5064                                 break;
5065                 }
5066
5067                 if (i >= 0)
5068                         lastdir = i;
5069
5070                 ASSERT(!(lastbase == -1 && firstbase != -1));
5071                 ASSERT(!(firstbase == -1 && lastdir != -1));
5072
5073                 if (lastbase == -1) {
5074                         /*
5075                          * We didn't find a non-slash character.  We know that
5076                          * the length is non-zero, so the whole string must be
5077                          * slashes.  In either the dirname or the basename
5078                          * case, we return '/'.
5079                          */
5080                         ASSERT(firstbase == -1);
5081                         firstbase = lastbase = lastdir = 0;
5082                 }
5083
5084                 if (firstbase == -1) {
5085                         /*
5086                          * The entire string consists only of a basename
5087                          * component.  If we're looking for dirname, we need
5088                          * to change our string to be just "."; if we're
5089                          * looking for a basename, we'll just set the first
5090                          * character of the basename to be 0.
5091                          */
5092                         if (subr == DIF_SUBR_DIRNAME) {
5093                                 ASSERT(lastdir == -1);
5094                                 src = (uintptr_t)".";
5095                                 lastdir = 0;
5096                         } else {
5097                                 firstbase = 0;
5098                         }
5099                 }
5100
5101                 if (subr == DIF_SUBR_DIRNAME) {
5102                         if (lastdir == -1) {
5103                                 /*
5104                                  * We know that we have a slash in the name --
5105                                  * or lastdir would be set to 0, above.  And
5106                                  * because lastdir is -1, we know that this
5107                                  * slash must be the first character.  (That
5108                                  * is, the full string must be of the form
5109                                  * "/basename".)  In this case, the last
5110                                  * character of the directory name is 0.
5111                                  */
5112                                 lastdir = 0;
5113                         }
5114
5115                         start = 0;
5116                         end = lastdir;
5117                 } else {
5118                         ASSERT(subr == DIF_SUBR_BASENAME);
5119                         ASSERT(firstbase != -1 && lastbase != -1);
5120                         start = firstbase;
5121                         end = lastbase;
5122                 }
5123
                     /*
                      * Copy the inclusive range [start, end], stopping early if
                      * only the terminator's byte is left in the destination.
                      */
5124                 for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
5125                         dest[j] = dtrace_load8(src + i);
5126
5127                 dest[j] = '\0';
5128                 regs[rd] = (uintptr_t)dest;
5129                 mstate->dtms_scratch_ptr += size;
5130                 break;
5131         }
5132
5133         case DIF_SUBR_CLEANPATH: {
                     /*
                      * cleanpath(): arg0 is a path.  A copy is built in a
                      * strsize-byte scratch buffer with runs of slashes collapsed,
                      * "/./" components dropped, and "/../" components resolved by
                      * discarding output back to the preceding '/'.  i indexes the
                      * source and j the destination; a read at or past lim is
                      * treated as the terminating NUL.
                      */
5134                 char *dest = (char *)mstate->dtms_scratch_ptr, c;
5135                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5136                 uintptr_t src = tupregs[0].dttk_value;
5137                 size_t lim;
5138                 size_t i = 0, j = 0;
5139
5140                 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5141                         regs[rd] = 0;
5142                         break;
5143                 }
5144
5145                 if (!DTRACE_INSCRATCH(mstate, size)) {
5146                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5147                         regs[rd] = 0;
5148                         break;
5149                 }
5150
5151                 /*
5152                  * Move forward, loading each character.
5153                  */
5154                 do {
5155                         c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
                             /*
                              * The branches below jump back here when they have
                              * already loaded the character to examine next.
                              */
5156 next:
                             /*
                              * Stop while there is still room for the longest
                              * sequence stored by a single pass below plus the
                              * terminator stored after the loop.
                              */
5157                         if ((uint64_t)(j + 5) >= size)  /* 5 = strlen("/..c\0") */
5158                                 break;
5159
5160                         if (c != '/') {
5161                                 dest[j++] = c;
5162                                 continue;
5163                         }
5164
5165                         c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5166
5167                         if (c == '/') {
5168                                 /*
5169                                  * We have two slashes -- we can just advance
5170                                  * to the next character.
5171                                  */
5172                                 goto next;
5173                         }
5174
5175                         if (c != '.') {
5176                                 /*
5177                                  * This is not "." and it's not ".." -- we can
5178                                  * just store the "/" and this character and
5179                                  * drive on.
5180                                  */
5181                                 dest[j++] = '/';
5182                                 dest[j++] = c;
5183                                 continue;
5184                         }
5185
5186                         c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5187
5188                         if (c == '/') {
5189                                 /*
5190                                  * This is a "/./" component.  We're not going
5191                                  * to store anything in the destination buffer;
5192                                  * we're just going to go to the next component.
5193                                  */
5194                                 goto next;
5195                         }
5196
5197                         if (c != '.') {
5198                                 /*
5199                                  * This is not ".." -- we can just store the
5200                                  * "/." and this character and continue
5201                                  * processing.
5202                                  */
5203                                 dest[j++] = '/';
5204                                 dest[j++] = '.';
5205                                 dest[j++] = c;
5206                                 continue;
5207                         }
5208
5209                         c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5210
5211                         if (c != '/' && c != '\0') {
5212                                 /*
5213                                  * This is not ".." -- it's "..[mumble]".
5214                                  * We'll store the "/.." and this character
5215                                  * and continue processing.
5216                                  */
5217                                 dest[j++] = '/';
5218                                 dest[j++] = '.';
5219                                 dest[j++] = '.';
5220                                 dest[j++] = c;
5221                                 continue;
5222                         }
5223
5224                         /*
5225                          * This is "/../" or "/..\0".  We need to back up
5226                          * our destination pointer until we find a "/".
5227                          */
                             /*
                              * Step the source index back so that a '/' which
                              * ended the ".." is loaded again by the next pass and
                              * starts the following component.
                              */
5228                         i--;
5229                         while (j != 0 && dest[--j] != '/')
5230                                 continue;
5231
                             /*
                              * At the end of the input, the index advanced to here
                              * is the one the terminator is stored at after the
                              * loop, so the result ends right after the byte the
                              * back-up stopped on.
                              * NOTE(review): if the back-up stopped because j
                              * reached 0 with nothing stored yet (e.g. a path of
                              * just "/.."), dest[0] is not written on this path --
                              * confirm that case is handled as intended.
                              */
5232                         if (c == '\0')
5233                                 dest[++j] = '/';
5234                 } while (c != '\0');
5235
5236                 dest[j] = '\0';
5237                 regs[rd] = (uintptr_t)dest;
5238                 mstate->dtms_scratch_ptr += size;
5239                 break;
5240         }
5241
5242         case DIF_SUBR_INET_NTOA:
5243         case DIF_SUBR_INET_NTOA6:
5244         case DIF_SUBR_INET_NTOP: {
5245                 size_t size;
5246                 int af, argi, i;
5247                 char *base, *end;
5248
5249                 if (subr == DIF_SUBR_INET_NTOP) {
5250                         af = (int)tupregs[0].dttk_value;
5251                         argi = 1;
5252                 } else {
5253                         af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5254                         argi = 0;
5255                 }
5256
5257                 if (af == AF_INET) {
5258 #if !defined(__APPLE__)
5259                         ipaddr_t ip4;
5260 #else
5261                         uint32_t ip4;
5262 #endif /* __APPLE__ */
5263                         uint8_t *ptr8, val;
5264
5265                         /*
5266                          * Safely load the IPv4 address.
5267                          */
5268 #if !defined(__APPLE__)
5269                         ip4 = dtrace_load32(tupregs[argi].dttk_value);
5270 #else
5271                         if (!dtrace_canload(tupregs[argi].dttk_value, sizeof(ip4),
5272                                 mstate, vstate)) {
5273                                 regs[rd] = 0;
5274                                 break;
5275                         }
5276
5277                         dtrace_bcopy(
5278                             (void *)(uintptr_t)tupregs[argi].dttk_value,
5279                             (void *)(uintptr_t)&ip4, sizeof (ip4));
5280 #endif /* __APPLE__ */
5281                         /*
5282                          * Check an IPv4 string will fit in scratch.
5283                          */
5284 #if !defined(__APPLE__)
5285                         size = INET_ADDRSTRLEN;
5286 #else
5287                         size = MAX_IPv4_STR_LEN;
5288 #endif /* __APPLE__ */
5289                         if (!DTRACE_INSCRATCH(mstate, size)) {
5290                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5291                                 regs[rd] = 0;
5292                                 break;
5293                         }
5294                         base = (char *)mstate->dtms_scratch_ptr;
5295                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
5296
5297                         /*
5298                          * Stringify as a dotted decimal quad.
5299                          */
5300                         *end-- = '\0';
5301                         ptr8 = (uint8_t *)&ip4;
5302                         for (i = 3; i >= 0; i--) {
5303                                 val = ptr8[i];
5304
5305                                 if (val == 0) {
5306                                         *end-- = '0';
5307                                 } else {
5308                                         for (; val; val /= 10) {
5309                                                 *end-- = '0' + (val % 10);
5310                                         }
5311                                 }
5312
5313                                 if (i > 0)
5314                                         *end-- = '.';
5315                         }
5316                         ASSERT(end + 1 >= base);
5317
5318                 } else if (af == AF_INET6) {
5319 #if defined(__APPLE__)
5320 #define _S6_un __u6_addr
5321 #define _S6_u8 __u6_addr8
5322 #endif /* __APPLE__ */
5323                         struct in6_addr ip6;
5324                         int firstzero, tryzero, numzero, v6end;
5325                         uint16_t val;
5326                         const char digits[] = "0123456789abcdef";
5327
5328                         /*
5329                          * Stringify using RFC 1884 convention 2 - 16 bit
5330                          * hexadecimal values with a zero-run compression.
5331                          * Lower case hexadecimal digits are used.
5332                          *      eg, fe80::214:4fff:fe0b:76c8.
5333                          * The IPv4 embedded form is returned for inet_ntop,
5334                          * just the IPv4 string is returned for inet_ntoa6.
5335                          */
5336
5337                         if (!dtrace_canload(tupregs[argi].dttk_value,
5338                                 sizeof(struct in6_addr), mstate, vstate)) {
5339                                 regs[rd] = 0;
5340                                 break;
5341                         }
5342
5343                         /*
5344                          * Safely load the IPv6 address.
5345                          */
5346                         dtrace_bcopy(
5347                             (void *)(uintptr_t)tupregs[argi].dttk_value,
5348                             (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5349
5350                         /*
5351                          * Check an IPv6 string will fit in scratch.
5352                          */
5353                         size = INET6_ADDRSTRLEN;
5354                         if (!DTRACE_INSCRATCH(mstate, size)) {
5355                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5356                                 regs[rd] = 0;
5357                                 break;
5358                         }
5359                         base = (char *)mstate->dtms_scratch_ptr;
5360                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
5361                         *end-- = '\0';
5362
5363                         /*
5364                          * Find the longest run of 16 bit zero values
5365                          * for the single allowed zero compression - "::".
5366                          */
5367                         firstzero = -1;
5368                         tryzero = -1;
5369                         numzero = 1;
5370                         for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
5371                                 if (ip6._S6_un._S6_u8[i] == 0 &&
5372                                     tryzero == -1 && i % 2 == 0) {
5373                                         tryzero = i;
5374                                         continue;
5375                                 }
5376
5377                                 if (tryzero != -1 &&
5378                                     (ip6._S6_un._S6_u8[i] != 0 ||
5379                                     i == sizeof (struct in6_addr) - 1)) {
5380
5381                                         if (i - tryzero <= numzero) {
5382                                                 tryzero = -1;
5383                                                 continue;
5384                                         }
5385
5386                                         firstzero = tryzero;
5387                                         numzero = i - i % 2 - tryzero;
5388                                         tryzero = -1;
5389
5390                                         if (ip6._S6_un._S6_u8[i] == 0 &&
5391                                             i == sizeof (struct in6_addr) - 1)
5392                                                 numzero += 2;
5393                                 }
5394                         }
5395                         ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
5396
5397                         /*
5398                          * Check for an IPv4 embedded address.
5399                          */
5400                         v6end = sizeof (struct in6_addr) - 2;
5401                         if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5402                             IN6_IS_ADDR_V4COMPAT(&ip6)) {
5403                                 for (i = sizeof (struct in6_addr) - 1;
5404                                      i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
5405                                         ASSERT(end >= base);
5406
5407                                         val = ip6._S6_un._S6_u8[i];
5408
5409                                         if (val == 0) {
5410                                                 *end-- = '0';
5411                                         } else {
5412                                                 for (; val; val /= 10) {
5413                                                         *end-- = '0' + val % 10;
5414                                                 }
5415                                         }
5416
5417                                         if (i > (int)DTRACE_V4MAPPED_OFFSET)
5418                                                 *end-- = '.';
5419                                 }
5420
5421                                 if (subr == DIF_SUBR_INET_NTOA6)
5422                                         goto inetout;
5423
5424                                 /*
5425                                  * Set v6end to skip the IPv4 address that
5426                                  * we have already stringified.
5427                                  */
5428                                 v6end = 10;
5429                         }
5430
5431                         /*
5432                          * Build the IPv6 string by working through the
5433                          * address in reverse.
5434                          */
5435                         for (i = v6end; i >= 0; i -= 2) {
5436                                 ASSERT(end >= base);
5437
5438                                 if (i == firstzero + numzero - 2) {
5439                                         *end-- = ':';
5440                                         *end-- = ':';
5441                                         i -= numzero - 2;
5442                                         continue;
5443                                 }
5444
5445                                 if (i < 14 && i != firstzero - 2)
5446                                         *end-- = ':';
5447
5448                                 val = (ip6._S6_un._S6_u8[i] << 8) +
5449                                     ip6._S6_un._S6_u8[i + 1];
5450
5451                                 if (val == 0) {
5452                                         *end-- = '0';
5453                                 } else {
5454                                         for (; val; val /= 16) {
5455                                                 *end-- = digits[val % 16];
5456                                         }
5457                                 }
5458                         }
5459                         ASSERT(end + 1 >= base);
5460
5461 #if defined(__APPLE__)
5462 #undef _S6_un
5463 #undef _S6_u8
5464 #endif /* __APPLE__ */
5465                 } else {
5466                         /*
5467                          * The user didn't use AF_INET or AF_INET6.
5468                          */
5469                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5470                         regs[rd] = 0;
5471                         break;
5472                 }
5473
5474 inetout:        regs[rd] = (uintptr_t)end + 1;
5475                 mstate->dtms_scratch_ptr += size;
5476                 break;
5477         }
5478
5479         case DIF_SUBR_JSON: {
5480                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5481                 uintptr_t json = tupregs[0].dttk_value;
5482                 size_t jsonlen = dtrace_strlen((char *)json, size);
5483                 uintptr_t elem = tupregs[1].dttk_value;
5484                 size_t elemlen = dtrace_strlen((char *)elem, size);
5485
5486                 char *dest = (char *)mstate->dtms_scratch_ptr;
5487                 char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
5488                 char *ee = elemlist;
5489                 int nelems = 1;
5490                 uintptr_t cur;
5491
5492                 if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
5493                     !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
5494                         regs[rd] = 0;
5495                         break;
5496                 }
5497
5498                 if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
5499                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5500                         regs[rd] = 0;
5501                         break;
5502                 }
5503
5504                 /*
5505                  * Read the element selector and split it up into a packed list
5506                  * of strings.
5507                  */
5508                 for (cur = elem; cur < elem + elemlen; cur++) {
5509                         char cc = dtrace_load8(cur);
5510
5511                         if (cur == elem && cc == '[') {
5512                                 /*
5513                                  * If the first element selector key is
5514                                  * actually an array index then ignore the
5515                                  * bracket.
5516                                  */
5517                                 continue;
5518                         }
5519
5520                         if (cc == ']')
5521                                 continue;
5522
5523                         if (cc == '.' || cc == '[') {
5524                                 nelems++;
5525                                 cc = '\0';
5526                         }
5527
5528                         *ee++ = cc;
5529                 }
5530                 *ee++ = '\0';
5531
5532                 if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
5533                     nelems, dest)) != 0)
5534                         mstate->dtms_scratch_ptr += jsonlen + 1;
5535                 break;
5536         }
5537
5538         case DIF_SUBR_TOUPPER:
5539         case DIF_SUBR_TOLOWER: {
5540                 uintptr_t src = tupregs[0].dttk_value;
5541                 char *dest = (char *)mstate->dtms_scratch_ptr;
5542                 char lower, upper, base, c;
5543                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5544                 size_t len = dtrace_strlen((char*) src, size);
5545                 size_t i = 0;
5546
5547                 lower = (subr == DIF_SUBR_TOUPPER) ? 'a' : 'A';
5548                 upper = (subr == DIF_SUBR_TOUPPER) ? 'z' : 'Z';
5549                 base  = (subr == DIF_SUBR_TOUPPER) ? 'A' : 'a';
5550
5551                 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5552                         regs[rd] = 0;
5553                         break;
5554                 }
5555
5556                 if (!DTRACE_INSCRATCH(mstate, size)) {
5557                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5558                         regs[rd] = 0;
5559                         break;
5560                 }
5561
5562                 for (i = 0; i < size - 1; ++i) {
5563                         if ((c = dtrace_load8(src + i)) == '\0')
5564                                 break;
5565                         if (c >= lower && c <= upper)
5566                                 c = base + (c - lower);
5567                         dest[i] = c;
5568                 }
5569
5570                 ASSERT(i < size);
5571
5572                 dest[i] = '\0';
5573                 regs[rd] = (uintptr_t) dest;
5574                 mstate->dtms_scratch_ptr += size;
5575
5576                 break;
5577         }
5578         case DIF_SUBR_STRIP:
5579                 if (!dtrace_is_valid_ptrauth_key(tupregs[1].dttk_value)) {
5580                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5581                         break;
5582                 }
5583                 regs[rd] = (uint64_t)dtrace_ptrauth_strip(
5584                     (void*)tupregs[0].dttk_value, tupregs[1].dttk_value);
5585                 break;
5586
5587 #if defined(__APPLE__)
5588         case DIF_SUBR_VM_KERNEL_ADDRPERM: {
5589                 if (!dtrace_priv_kernel(state)) {
5590                         regs[rd] = 0;
5591                 } else {
5592                         regs[rd] = VM_KERNEL_ADDRPERM((vm_offset_t) tupregs[0].dttk_value);
5593                 }
5594
5595                 break;
5596         }
5597
5598         case DIF_SUBR_KDEBUG_TRACE: {
5599                 uint32_t debugid;
5600                 uintptr_t args[4] = {0};
5601                 int i;
5602
5603                 if (nargs < 2 || nargs > 5) {
5604                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5605                         break;
5606                 }
5607
5608                 if (dtrace_destructive_disallow)
5609                         return;
5610
5611                 debugid = tupregs[0].dttk_value;
5612                 for (i = 0; i < nargs - 1; i++)
5613                         args[i] = tupregs[i + 1].dttk_value;
5614
5615                 kernel_debug(debugid, args[0], args[1], args[2], args[3], 0);
5616
5617                 break;
5618         }
5619
5620         case DIF_SUBR_KDEBUG_TRACE_STRING: {
5621                 if (nargs != 3) {
5622                         break;
5623                 }
5624
5625                 if (dtrace_destructive_disallow)
5626                         return;
5627
5628                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5629                 uint32_t debugid = tupregs[0].dttk_value;
5630                 uint64_t str_id = tupregs[1].dttk_value;
5631                 uintptr_t src = tupregs[2].dttk_value;
5632                 size_t lim;
5633                 char buf[size];
5634                 char* str = NULL;
5635
5636                 if (src != (uintptr_t)0) {
5637                         str = buf;
5638                         if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5639                                 break;
5640                         }
5641                         dtrace_strcpy((void*)src, buf, size);
5642                 }
5643
5644                 (void)kernel_debug_string(debugid, &str_id, str);
5645                 regs[rd] = str_id;
5646
5647                 break;
5648         }
5649 #endif
5650
5651         }
5652 }
5653
5654 /*
5655  * Emulate the execution of DTrace IR instructions specified by the given
5656  * DIF object.  This function is deliberately void of assertions as all of
5657  * the necessary checks are handled by a call to dtrace_difo_validate().
5658  */
5659 static uint64_t
5660 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5661     dtrace_vstate_t *vstate, dtrace_state_t *state)
5662 {
5663         const dif_instr_t *text = difo->dtdo_buf;
5664         const uint_t textlen = difo->dtdo_len;
5665         const char *strtab = difo->dtdo_strtab;
5666         const uint64_t *inttab = difo->dtdo_inttab;
5667
5668         uint64_t rval = 0;
5669         dtrace_statvar_t *svar;
5670         dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5671         dtrace_difv_t *v;
5672         volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5673         volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5674
5675         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5676         uint64_t regs[DIF_DIR_NREGS];
5677         uint64_t *tmp;
5678
5679         uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5680         int64_t cc_r;
5681         uint_t pc = 0, id, opc = 0;
5682         uint8_t ttop = 0;
5683         dif_instr_t instr;
5684         uint_t r1, r2, rd;
5685
5686         /*
5687          * We stash the current DIF object into the machine state: we need it
5688          * for subsequent access checking.
5689          */
5690         mstate->dtms_difo = difo;
5691
5692         regs[DIF_REG_R0] = 0;           /* %r0 is fixed at zero */
5693
5694         while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5695                 opc = pc;
5696
5697                 instr = text[pc++];
5698                 r1 = DIF_INSTR_R1(instr);
5699                 r2 = DIF_INSTR_R2(instr);
5700                 rd = DIF_INSTR_RD(instr);
5701
5702                 switch (DIF_INSTR_OP(instr)) {
5703                 case DIF_OP_OR:
5704                         regs[rd] = regs[r1] | regs[r2];
5705                         break;
5706                 case DIF_OP_XOR:
5707                         regs[rd] = regs[r1] ^ regs[r2];
5708                         break;
5709                 case DIF_OP_AND:
5710                         regs[rd] = regs[r1] & regs[r2];
5711                         break;
5712                 case DIF_OP_SLL:
5713                         regs[rd] = regs[r1] << regs[r2];
5714                         break;
5715                 case DIF_OP_SRL:
5716                         regs[rd] = regs[r1] >> regs[r2];
5717                         break;
5718                 case DIF_OP_SUB:
5719                         regs[rd] = regs[r1] - regs[r2];
5720                         break;
5721                 case DIF_OP_ADD:
5722                         regs[rd] = regs[r1] + regs[r2];
5723                         break;
5724                 case DIF_OP_MUL:
5725                         regs[rd] = regs[r1] * regs[r2];
5726                         break;
5727                 case DIF_OP_SDIV:
5728                         if (regs[r2] == 0) {
5729                                 regs[rd] = 0;
5730                                 *flags |= CPU_DTRACE_DIVZERO;
5731                         } else {
5732                                 regs[rd] = (int64_t)regs[r1] /
5733                                     (int64_t)regs[r2];
5734                         }
5735                         break;
5736
5737                 case DIF_OP_UDIV:
5738                         if (regs[r2] == 0) {
5739                                 regs[rd] = 0;
5740                                 *flags |= CPU_DTRACE_DIVZERO;
5741                         } else {
5742                                 regs[rd] = regs[r1] / regs[r2];
5743                         }
5744                         break;
5745
5746                 case DIF_OP_SREM:
5747                         if (regs[r2] == 0) {
5748                                 regs[rd] = 0;
5749                                 *flags |= CPU_DTRACE_DIVZERO;
5750                         } else {
5751                                 regs[rd] = (int64_t)regs[r1] %
5752                                     (int64_t)regs[r2];
5753                         }
5754                         break;
5755
5756                 case DIF_OP_UREM:
5757                         if (regs[r2] == 0) {
5758                                 regs[rd] = 0;
5759                                 *flags |= CPU_DTRACE_DIVZERO;
5760                         } else {
5761                                 regs[rd] = regs[r1] % regs[r2];
5762                         }
5763                         break;
5764
5765                 case DIF_OP_NOT:
5766                         regs[rd] = ~regs[r1];
5767                         break;
5768                 case DIF_OP_MOV:
5769                         regs[rd] = regs[r1];
5770                         break;
5771                 case DIF_OP_CMP:
5772                         cc_r = regs[r1] - regs[r2];
5773                         cc_n = cc_r < 0;
5774                         cc_z = cc_r == 0;
5775                         cc_v = 0;
5776                         cc_c = regs[r1] < regs[r2];
5777                         break;
5778                 case DIF_OP_TST:
5779                         cc_n = cc_v = cc_c = 0;
5780                         cc_z = regs[r1] == 0;
5781                         break;
5782                 case DIF_OP_BA:
5783                         pc = DIF_INSTR_LABEL(instr);
5784                         break;
5785                 case DIF_OP_BE:
5786                         if (cc_z)
5787                                 pc = DIF_INSTR_LABEL(instr);
5788                         break;
5789                 case DIF_OP_BNE:
5790                         if (cc_z == 0)
5791                                 pc = DIF_INSTR_LABEL(instr);
5792                         break;
5793                 case DIF_OP_BG:
5794                         if ((cc_z | (cc_n ^ cc_v)) == 0)
5795                                 pc = DIF_INSTR_LABEL(instr);
5796                         break;
5797                 case DIF_OP_BGU:
5798                         if ((cc_c | cc_z) == 0)
5799                                 pc = DIF_INSTR_LABEL(instr);
5800                         break;
5801                 case DIF_OP_BGE:
5802                         if ((cc_n ^ cc_v) == 0)
5803                                 pc = DIF_INSTR_LABEL(instr);
5804                         break;
5805                 case DIF_OP_BGEU:
5806                         if (cc_c == 0)
5807                                 pc = DIF_INSTR_LABEL(instr);
5808                         break;
5809                 case DIF_OP_BL:
5810                         if (cc_n ^ cc_v)
5811                                 pc = DIF_INSTR_LABEL(instr);
5812                         break;
5813                 case DIF_OP_BLU:
5814                         if (cc_c)
5815                                 pc = DIF_INSTR_LABEL(instr);
5816                         break;
5817                 case DIF_OP_BLE:
5818                         if (cc_z | (cc_n ^ cc_v))
5819                                 pc = DIF_INSTR_LABEL(instr);
5820                         break;
5821                 case DIF_OP_BLEU:
5822                         if (cc_c | cc_z)
5823                                 pc = DIF_INSTR_LABEL(instr);
5824                         break;
5825                 case DIF_OP_RLDSB:
5826                         if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5827                                 *flags |= CPU_DTRACE_KPRIV;
5828                                 *illval = regs[r1];
5829                                 break;
5830                         }
5831                         /*FALLTHROUGH*/
5832                 case DIF_OP_LDSB:
5833                         regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5834                         break;
5835                 case DIF_OP_RLDSH:
5836                         if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5837                                 *flags |= CPU_DTRACE_KPRIV;
5838                                 *illval = regs[r1];
5839                                 break;
5840                         }
5841                         /*FALLTHROUGH*/
5842                 case DIF_OP_LDSH:
5843                         regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5844                         break;
5845                 case DIF_OP_RLDSW:
5846                         if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5847                                 *flags |= CPU_DTRACE_KPRIV;
5848                                 *illval = regs[r1];
5849                                 break;
5850                         }
5851                         /*FALLTHROUGH*/
5852                 case DIF_OP_LDSW:
5853                         regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5854                         break;
5855                 case DIF_OP_RLDUB:
5856                         if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5857                                 *flags |= CPU_DTRACE_KPRIV;
5858                                 *illval = regs[r1];
5859                                 break;
5860                         }
5861                         /*FALLTHROUGH*/
5862                 case DIF_OP_LDUB:
5863                         regs[rd] = dtrace_load8(regs[r1]);
5864                         break;
5865                 case DIF_OP_RLDUH:
5866                         if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5867                                 *flags |= CPU_DTRACE_KPRIV;
5868                                 *illval = regs[r1];
5869                                 break;
5870                         }
5871                         /*FALLTHROUGH*/
5872                 case DIF_OP_LDUH:
5873                         regs[rd] = dtrace_load16(regs[r1]);
5874                         break;
5875                 case DIF_OP_RLDUW:
5876                         if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5877                                 *flags |= CPU_DTRACE_KPRIV;
5878                                 *illval = regs[r1];
5879                                 break;
5880                         }
5881                         /*FALLTHROUGH*/
5882                 case DIF_OP_LDUW:
5883                         regs[rd] = dtrace_load32(regs[r1]);
5884                         break;
5885                 case DIF_OP_RLDX:
5886                         if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
5887                                 *flags |= CPU_DTRACE_KPRIV;
5888                                 *illval = regs[r1];
5889                                 break;
5890                         }
5891                         /*FALLTHROUGH*/
5892                 case DIF_OP_LDX:
5893                         regs[rd] = dtrace_load64(regs[r1]);
5894                         break;
5895 /*
5896  * Darwin 32-bit kernel may fetch from 64-bit user.
5897  * Do not cast regs to uintptr_t
5898  * DIF_OP_ULDSB,DIF_OP_ULDSH, DIF_OP_ULDSW, DIF_OP_ULDUB
5899  * DIF_OP_ULDUH, DIF_OP_ULDUW, DIF_OP_ULDX
5900  */
5901                 case DIF_OP_ULDSB:
5902                         regs[rd] = (int8_t)
5903                             dtrace_fuword8(regs[r1]);
5904                         break;
5905                 case DIF_OP_ULDSH:
5906                         regs[rd] = (int16_t)
5907                             dtrace_fuword16(regs[r1]);
5908                         break;
5909                 case DIF_OP_ULDSW:
5910                         regs[rd] = (int32_t)
5911                             dtrace_fuword32(regs[r1]);
5912                         break;
5913                 case DIF_OP_ULDUB:
5914                         regs[rd] =
5915                             dtrace_fuword8(regs[r1]);
5916                         break;
5917                 case DIF_OP_ULDUH:
5918                         regs[rd] =
5919                             dtrace_fuword16(regs[r1]);
5920                         break;
5921                 case DIF_OP_ULDUW:
5922                         regs[rd] =
5923                             dtrace_fuword32(regs[r1]);
5924                         break;
5925                 case DIF_OP_ULDX:
5926                         regs[rd] =
5927                             dtrace_fuword64(regs[r1]);
5928                         break;
5929                 case DIF_OP_RET:
5930                         rval = regs[rd];
5931                         pc = textlen;
5932                         break;
5933                 case DIF_OP_NOP:
5934                         break;
5935                 case DIF_OP_SETX:
5936                         regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5937                         break;
5938                 case DIF_OP_SETS:
5939                         regs[rd] = (uint64_t)(uintptr_t)
5940                             (strtab + DIF_INSTR_STRING(instr));
5941                         break;
5942                 case DIF_OP_SCMP: {
5943                         size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5944                         uintptr_t s1 = regs[r1];
5945                         uintptr_t s2 = regs[r2];
5946                         size_t lim1 = sz, lim2 = sz;
5947
5948                         if (s1 != 0 &&
5949                             !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
5950                                 break;
5951                         if (s2 != 0 &&
5952                             !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
5953                                 break;
5954
5955                         cc_r = dtrace_strncmp((char *)s1, (char *)s2,
5956                                 MIN(lim1, lim2));
5957
5958                         cc_n = cc_r < 0;
5959                         cc_z = cc_r == 0;
5960                         cc_v = cc_c = 0;
5961                         break;
5962                 }
5963                 case DIF_OP_LDGA:
5964                         regs[rd] = dtrace_dif_variable(mstate, state,
5965                             r1, regs[r2]);
5966                         break;
5967                 case DIF_OP_LDGS:
5968                         id = DIF_INSTR_VAR(instr);
5969
5970                         if (id >= DIF_VAR_OTHER_UBASE) {
5971                                 uintptr_t a;
5972
5973                                 id -= DIF_VAR_OTHER_UBASE;
5974                                 svar = vstate->dtvs_globals[id];
5975                                 ASSERT(svar != NULL);
5976                                 v = &svar->dtsv_var;
5977
5978                                 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5979                                         regs[rd] = svar->dtsv_data;
5980                                         break;
5981                                 }
5982
5983                                 a = (uintptr_t)svar->dtsv_data;
5984
5985                                 if (*(uint8_t *)a == UINT8_MAX) {
5986                                         /*
5987                                          * If the 0th byte is set to UINT8_MAX
5988                                          * then this is to be treated as a
5989                                          * reference to a NULL variable.
5990                                          */
5991                                         regs[rd] = 0;
5992                                 } else {
5993                                         regs[rd] = a + sizeof (uint64_t);
5994                                 }
5995
5996                                 break;
5997                         }
5998
5999                         regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6000                         break;
6001
6002                 case DIF_OP_STGS:
6003                         id = DIF_INSTR_VAR(instr);
6004
6005                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6006                         id -= DIF_VAR_OTHER_UBASE;
6007
6008                         VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6009                         svar = vstate->dtvs_globals[id];
6010                         ASSERT(svar != NULL);
6011                         v = &svar->dtsv_var;
6012
6013                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6014                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
6015                                 size_t lim;
6016
6017                                 ASSERT(a != 0);
6018                                 ASSERT(svar->dtsv_size != 0);
6019
6020                                 if (regs[rd] == 0) {
6021                                         *(uint8_t *)a = UINT8_MAX;
6022                                         break;
6023                                 } else {
6024                                         *(uint8_t *)a = 0;
6025                                         a += sizeof (uint64_t);
6026                                 }
6027                                 if (!dtrace_vcanload(
6028                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6029                                         &lim, mstate, vstate))
6030                                         break;
6031
6032                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6033                                     (void *)a, &v->dtdv_type, lim);
6034                                 break;
6035                         }
6036
6037                         svar->dtsv_data = regs[rd];
6038                         break;
6039
6040                 case DIF_OP_LDTA:
6041                         /*
6042                          * There are no DTrace built-in thread-local arrays at
6043                          * present.  This opcode is saved for future work.
6044                          */
6045                         *flags |= CPU_DTRACE_ILLOP;
6046                         regs[rd] = 0;
6047                         break;
6048
6049                 case DIF_OP_LDLS:
6050                         id = DIF_INSTR_VAR(instr);
6051
6052                         if (id < DIF_VAR_OTHER_UBASE) {
6053                                 /*
6054                                  * For now, this has no meaning.
6055                                  */
6056                                 regs[rd] = 0;
6057                                 break;
6058                         }
6059
6060                         id -= DIF_VAR_OTHER_UBASE;
6061
6062                         ASSERT(id < (uint_t)vstate->dtvs_nlocals);
6063                         ASSERT(vstate->dtvs_locals != NULL);
6064                         svar = vstate->dtvs_locals[id];
6065                         ASSERT(svar != NULL);
6066                         v = &svar->dtsv_var;
6067
6068                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6069                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
6070                                 size_t sz = v->dtdv_type.dtdt_size;
6071
6072                                 sz += sizeof (uint64_t);
6073                                 ASSERT(svar->dtsv_size == (int)NCPU * sz);
6074                                 a += CPU->cpu_id * sz;
6075
6076                                 if (*(uint8_t *)a == UINT8_MAX) {
6077                                         /*
6078                                          * If the 0th byte is set to UINT8_MAX
6079                                          * then this is to be treated as a
6080                                          * reference to a NULL variable.
6081                                          */
6082                                         regs[rd] = 0;
6083                                 } else {
6084                                         regs[rd] = a + sizeof (uint64_t);
6085                                 }
6086
6087                                 break;
6088                         }
6089
6090                         ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6091                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6092                         regs[rd] = tmp[CPU->cpu_id];
6093                         break;
6094
6095                 case DIF_OP_STLS:
6096                         id = DIF_INSTR_VAR(instr);
6097
6098                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6099                         id -= DIF_VAR_OTHER_UBASE;
6100                         VERIFY(id < (uint_t)vstate->dtvs_nlocals);
6101                         ASSERT(vstate->dtvs_locals != NULL);
6102                         svar = vstate->dtvs_locals[id];
6103                         ASSERT(svar != NULL);
6104                         v = &svar->dtsv_var;
6105
6106                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6107                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
6108                                 size_t sz = v->dtdv_type.dtdt_size;
6109                                 size_t lim;
6110
6111                                 sz += sizeof (uint64_t);
6112                                 ASSERT(svar->dtsv_size == (int)NCPU * sz);
6113                                 a += CPU->cpu_id * sz;
6114
6115                                 if (regs[rd] == 0) {
6116                                         *(uint8_t *)a = UINT8_MAX;
6117                                         break;
6118                                 } else {
6119                                         *(uint8_t *)a = 0;
6120                                         a += sizeof (uint64_t);
6121                                 }
6122
6123                                 if (!dtrace_vcanload(
6124                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6125                                     &lim, mstate, vstate))
6126                                         break;
6127
6128                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6129                                     (void *)a, &v->dtdv_type, lim);
6130                                 break;
6131                         }
6132
6133                         ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6134                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6135                         tmp[CPU->cpu_id] = regs[rd];
6136                         break;
6137
6138                 case DIF_OP_LDTS: {
6139                         dtrace_dynvar_t *dvar;
6140                         dtrace_key_t *key;
6141
6142                         id = DIF_INSTR_VAR(instr);
6143                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6144                         id -= DIF_VAR_OTHER_UBASE;
6145                         v = &vstate->dtvs_tlocals[id];
6146
6147                         key = &tupregs[DIF_DTR_NREGS];
6148                         key[0].dttk_value = (uint64_t)id;
6149                         key[0].dttk_size = 0;
6150                         DTRACE_TLS_THRKEY(key[1].dttk_value);
6151                         key[1].dttk_size = 0;
6152
6153                         dvar = dtrace_dynvar(dstate, 2, key,
6154                             sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6155                             mstate, vstate);
6156
6157                         if (dvar == NULL) {
6158                                 regs[rd] = 0;
6159                                 break;
6160                         }
6161
6162                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6163                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6164                         } else {
6165                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6166                         }
6167
6168                         break;
6169                 }
6170
6171                 case DIF_OP_STTS: {
6172                         dtrace_dynvar_t *dvar;
6173                         dtrace_key_t *key;
6174
6175                         id = DIF_INSTR_VAR(instr);
6176                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6177                         id -= DIF_VAR_OTHER_UBASE;
6178                         VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6179
6180                         key = &tupregs[DIF_DTR_NREGS];
6181                         key[0].dttk_value = (uint64_t)id;
6182                         key[0].dttk_size = 0;
6183                         DTRACE_TLS_THRKEY(key[1].dttk_value);
6184                         key[1].dttk_size = 0;
6185                         v = &vstate->dtvs_tlocals[id];
6186
6187                         dvar = dtrace_dynvar(dstate, 2, key,
6188                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6189                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
6190                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
6191                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6192
6193                         /*
6194                          * Given that we're storing to thread-local data,
6195                          * we need to flush our predicate cache.
6196                          */
6197                         dtrace_set_thread_predcache(current_thread(), 0);
6198
6199                         if (dvar == NULL)
6200                                 break;
6201
6202                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6203                                 size_t lim;
6204
6205                                 if (!dtrace_vcanload(
6206                                     (void *)(uintptr_t)regs[rd],
6207                                     &v->dtdv_type, &lim, mstate, vstate))
6208                                         break;
6209
6210                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6211                                     dvar->dtdv_data, &v->dtdv_type, lim);
6212                         } else {
6213                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6214                         }
6215
6216                         break;
6217                 }
6218
6219                 case DIF_OP_SRA:
6220                         regs[rd] = (int64_t)regs[r1] >> regs[r2];
6221                         break;
6222
6223                 case DIF_OP_CALL:
6224                         dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6225                             regs, tupregs, ttop, mstate, state);
6226                         break;
6227
6228                 case DIF_OP_PUSHTR:
6229                         if (ttop == DIF_DTR_NREGS) {
6230                                 *flags |= CPU_DTRACE_TUPOFLOW;
6231                                 break;
6232                         }
6233
6234                         if (r1 == DIF_TYPE_STRING) {
6235                                 /*
6236                                  * If this is a string type and the size is 0,
6237                                  * we'll use the system-wide default string
6238                                  * size.  Note that we are _not_ looking at
6239                                  * the value of the DTRACEOPT_STRSIZE option;
6240                                  * had this been set, we would expect to have
6241                                  * a non-zero size value in the "pushtr".
6242                                  */
6243                                 tupregs[ttop].dttk_size =
6244                                     dtrace_strlen((char *)(uintptr_t)regs[rd],
6245                                     regs[r2] ? regs[r2] :
6246                                     dtrace_strsize_default) + 1;
6247                         } else {
6248                                 if (regs[r2] > LONG_MAX) {
6249                                         *flags |= CPU_DTRACE_ILLOP;
6250                                         break;
6251                                 }
6252                                 tupregs[ttop].dttk_size = regs[r2];
6253                         }
6254
6255                         tupregs[ttop++].dttk_value = regs[rd];
6256                         break;
6257
6258                 case DIF_OP_PUSHTV:
6259                         if (ttop == DIF_DTR_NREGS) {
6260                                 *flags |= CPU_DTRACE_TUPOFLOW;
6261                                 break;
6262                         }
6263
6264                         tupregs[ttop].dttk_value = regs[rd];
6265                         tupregs[ttop++].dttk_size = 0;
6266                         break;
6267
6268                 case DIF_OP_POPTS:
6269                         if (ttop != 0)
6270                                 ttop--;
6271                         break;
6272
6273                 case DIF_OP_FLUSHTS:
6274                         ttop = 0;
6275                         break;
6276
6277                 case DIF_OP_LDGAA:
6278                 case DIF_OP_LDTAA: {
6279                         dtrace_dynvar_t *dvar;
6280                         dtrace_key_t *key = tupregs;
6281                         uint_t nkeys = ttop;
6282
6283                         id = DIF_INSTR_VAR(instr);
6284                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6285                         id -= DIF_VAR_OTHER_UBASE;
6286
6287                         key[nkeys].dttk_value = (uint64_t)id;
6288                         key[nkeys++].dttk_size = 0;
6289
6290                         if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6291                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6292                                 key[nkeys++].dttk_size = 0;
6293                                 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6294                                 v = &vstate->dtvs_tlocals[id];
6295                         } else {
6296                                 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6297                                 v = &vstate->dtvs_globals[id]->dtsv_var;
6298                         }
6299
6300                         dvar = dtrace_dynvar(dstate, nkeys, key,
6301                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6302                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
6303                             DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6304
6305                         if (dvar == NULL) {
6306                                 regs[rd] = 0;
6307                                 break;
6308                         }
6309
6310                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6311                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6312                         } else {
6313                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6314                         }
6315
6316                         break;
6317                 }
6318
6319                 case DIF_OP_STGAA:
6320                 case DIF_OP_STTAA: {
6321                         dtrace_dynvar_t *dvar;
6322                         dtrace_key_t *key = tupregs;
6323                         uint_t nkeys = ttop;
6324
6325                         id = DIF_INSTR_VAR(instr);
6326                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6327                         id -= DIF_VAR_OTHER_UBASE;
6328
6329                         key[nkeys].dttk_value = (uint64_t)id;
6330                         key[nkeys++].dttk_size = 0;
6331
6332                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6333                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6334                                 key[nkeys++].dttk_size = 0;
6335                                 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6336                                 v = &vstate->dtvs_tlocals[id];
6337                         } else {
6338                                 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6339                                 v = &vstate->dtvs_globals[id]->dtsv_var;
6340                         }
6341
6342                         dvar = dtrace_dynvar(dstate, nkeys, key,
6343                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6344                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
6345                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
6346                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6347
6348                         if (dvar == NULL)
6349                                 break;
6350
6351                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6352                                 size_t lim;
6353
6354                                 if (!dtrace_vcanload(
6355                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6356                                     &lim, mstate, vstate))
6357                                         break;
6358
6359                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6360                                     dvar->dtdv_data, &v->dtdv_type, lim);
6361                         } else {
6362                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6363                         }
6364
6365                         break;
6366                 }
6367
6368                 case DIF_OP_ALLOCS: {
6369                         uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6370                         size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6371
6372                         /*
6373                          * Rounding up the user allocation size could have
6374                          * overflowed large, bogus allocations (like -1ULL) to
6375                          * 0.
6376                          */
6377                         if (size < regs[r1] ||
6378                             !DTRACE_INSCRATCH(mstate, size)) {
6379                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6380                                 regs[rd] = 0;
6381                                 break;
6382                         }
6383
6384                         dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6385                                 mstate->dtms_scratch_ptr += size;
6386                                 regs[rd] = ptr;
6387                         break;
6388                 }
6389
6390                 case DIF_OP_COPYS:
6391                         if (!dtrace_canstore(regs[rd], regs[r2],
6392                             mstate, vstate)) {
6393                                 *flags |= CPU_DTRACE_BADADDR;
6394                                 *illval = regs[rd];
6395                                 break;
6396                         }
6397
6398                         if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6399                                 break;
6400
6401                         dtrace_bcopy((void *)(uintptr_t)regs[r1],
6402                             (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6403                         break;
6404
6405                 case DIF_OP_STB:
6406                         if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6407                                 *flags |= CPU_DTRACE_BADADDR;
6408                                 *illval = regs[rd];
6409                                 break;
6410                         }
6411                         *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6412                         break;
6413
6414                 case DIF_OP_STH:
6415                         if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6416                                 *flags |= CPU_DTRACE_BADADDR;
6417                                 *illval = regs[rd];
6418                                 break;
6419                         }
6420                         if (regs[rd] & 1) {
6421                                 *flags |= CPU_DTRACE_BADALIGN;
6422                                 *illval = regs[rd];
6423                                 break;
6424                         }
6425                         *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6426                         break;
6427
6428                 case DIF_OP_STW:
6429                         if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6430                                 *flags |= CPU_DTRACE_BADADDR;
6431                                 *illval = regs[rd];
6432                                 break;
6433                         }
6434                         if (regs[rd] & 3) {
6435                                 *flags |= CPU_DTRACE_BADALIGN;
6436                                 *illval = regs[rd];
6437                                 break;
6438                         }
6439                         *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6440                         break;
6441
6442                 case DIF_OP_STX:
6443                         if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6444                                 *flags |= CPU_DTRACE_BADADDR;
6445                                 *illval = regs[rd];
6446                                 break;
6447                         }
6448
6449                         /*
6450                         * Darwin kmem_zalloc() called from
6451                         * dtrace_difo_init() is 4-byte aligned.
6452                         */
6453                         if (regs[rd] & 3) {
6454                                 *flags |= CPU_DTRACE_BADALIGN;
6455                                 *illval = regs[rd];
6456                                 break;
6457                         }
6458                         *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6459                         break;
6460                 case DIF_OP_STRIP:
6461                         regs[rd] = (uint64_t)dtrace_ptrauth_strip(
6462                             (void*)regs[r1], r2);
6463                         break;
6464                 }
6465         }
6466
6467         if (!(*flags & CPU_DTRACE_FAULT))
6468                 return (rval);
6469
6470         mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6471         mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6472
6473         return (0);
6474 }
6475
6476 static void
6477 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6478 {
6479         dtrace_probe_t *probe = ecb->dte_probe;
6480         dtrace_provider_t *prov = probe->dtpr_provider;
6481         char c[DTRACE_FULLNAMELEN + 80], *str;
6482         const char *msg = "dtrace: breakpoint action at probe ";
6483         const char *ecbmsg = " (ecb ";
6484         uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6485         uintptr_t val = (uintptr_t)ecb;
6486         int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6487
6488         if (dtrace_destructive_disallow)
6489                 return;
6490
6491         /*
6492          * It's impossible to be taking action on the NULL probe.
6493          */
6494         ASSERT(probe != NULL);
6495
6496         /*
6497          * This is a poor man's (destitute man's?) sprintf():  we want to
6498          * print the provider name, module name, function name and name of
6499          * the probe, along with the hex address of the ECB with the breakpoint
6500          * action -- all of which we must place in the character buffer by
6501          * hand.
6502          */
6503         while (*msg != '\0')
6504                 c[i++] = *msg++;
6505
6506         for (str = prov->dtpv_name; *str != '\0'; str++)
6507                 c[i++] = *str;
6508         c[i++] = ':';
6509
6510         for (str = probe->dtpr_mod; *str != '\0'; str++)
6511                 c[i++] = *str;
6512         c[i++] = ':';
6513
6514         for (str = probe->dtpr_func; *str != '\0'; str++)
6515                 c[i++] = *str;
6516         c[i++] = ':';
6517
6518         for (str = probe->dtpr_name; *str != '\0'; str++)
6519                 c[i++] = *str;
6520
6521         while (*ecbmsg != '\0')
6522                 c[i++] = *ecbmsg++;
6523
6524         while (shift >= 0) {
6525                 mask = (uintptr_t)0xf << shift;
6526
6527                 if (val >= ((uintptr_t)1 << shift))
6528                         c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6529                 shift -= 4;
6530         }
6531
6532         c[i++] = ')';
6533         c[i] = '\0';
6534
6535         debug_enter(c);
6536 }
6537
6538 static void
6539 dtrace_action_panic(dtrace_ecb_t *ecb)
6540 {
6541         dtrace_probe_t *probe = ecb->dte_probe;
6542
6543         /*
6544          * It's impossible to be taking action on the NULL probe.
6545          */
6546         ASSERT(probe != NULL);
6547
6548         if (dtrace_destructive_disallow)
6549                 return;
6550
6551         if (dtrace_panicked != NULL)
6552                 return;
6553
6554         if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
6555                 return;
6556
6557         /*
6558          * We won the right to panic.  (We want to be sure that only one
6559          * thread calls panic() from dtrace_probe(), and that panic() is
6560          * called exactly once.)
6561          */
6562         panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6563             probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6564             probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6565
6566         /*
6567          * APPLE NOTE: this was for an old Mac OS X debug feature
6568          * allowing a return from panic().  Revisit someday.
6569          */
6570         dtrace_panicked = NULL;
6571 }
6572
6573 static void
6574 dtrace_action_raise(uint64_t sig)
6575 {
6576         if (dtrace_destructive_disallow)
6577                 return;
6578
6579         if (sig >= NSIG) {
6580                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6581                 return;
6582         }
6583
6584         /*
6585          * raise() has a queue depth of 1 -- we ignore all subsequent
6586          * invocations of the raise() action.
6587          */
6588
6589         uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6590
6591         if (uthread && uthread->t_dtrace_sig == 0) {
6592                 uthread->t_dtrace_sig = sig;
6593                 act_set_astbsd(current_thread());
6594         }
6595 }
6596
6597 static void
6598 dtrace_action_stop(void)
6599 {
6600         if (dtrace_destructive_disallow)
6601                 return;
6602
6603         uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6604         if (uthread) {
6605                 /*
6606                  * The currently running process will be set to task_suspend
6607                  * when it next leaves the kernel.
6608                 */
6609                 uthread->t_dtrace_stop = 1;
6610                 act_set_astbsd(current_thread());
6611         }
6612 }
6613
6614
6615 /*
6616  * APPLE NOTE: pidresume works in conjunction with the dtrace stop action.
6617  * Both activate only when the currently running process next leaves the
6618  * kernel.
6619  */
6620 static void
6621 dtrace_action_pidresume(uint64_t pid)
6622 {
6623         if (dtrace_destructive_disallow)
6624                 return;
6625
6626         if (kauth_cred_issuser(kauth_cred_get()) == 0) {
6627                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6628                 return;
6629         }
6630         uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6631
6632         /*
6633          * When the currently running process leaves the kernel, it attempts to
6634          * task_resume the process (denoted by pid), if that pid appears to have
6635          * been stopped by dtrace_action_stop().
6636          * The currently running process has a pidresume() queue depth of 1 --
6637          * subsequent invocations of the pidresume() action are ignored.
6638          */
6639
6640         if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) {
6641                 uthread->t_dtrace_resumepid = pid;
6642                 act_set_astbsd(current_thread());
6643         }
6644 }
6645
6646 static void
6647 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6648 {
6649         hrtime_t now;
6650         volatile uint16_t *flags;
6651         dtrace_cpu_t *cpu = CPU;
6652
6653         if (dtrace_destructive_disallow)
6654                 return;
6655
6656         flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6657
6658         now = dtrace_gethrtime();
6659
6660         if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6661                 /*
6662                  * We need to advance the mark to the current time.
6663                  */
6664                 cpu->cpu_dtrace_chillmark = now;
6665                 cpu->cpu_dtrace_chilled = 0;
6666         }
6667
6668         /*
6669          * Now check to see if the requested chill time would take us over
6670          * the maximum amount of time allowed in the chill interval.  (Or
6671          * worse, if the calculation itself induces overflow.)
6672          */
6673         if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6674             cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6675                 *flags |= CPU_DTRACE_ILLOP;
6676                 return;
6677         }
6678
6679         while (dtrace_gethrtime() - now < val)
6680                 continue;
6681
6682         /*
6683          * Normally, we assure that the value of the variable "timestamp" does
6684          * not change within an ECB.  The presence of chill() represents an
6685          * exception to this rule, however.
6686          */
6687         mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6688         cpu->cpu_dtrace_chilled += val;
6689 }
6690
6691 static void
6692 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6693     uint64_t *buf, uint64_t arg)
6694 {
6695         int nframes = DTRACE_USTACK_NFRAMES(arg);
6696         int strsize = DTRACE_USTACK_STRSIZE(arg);
6697         uint64_t *pcs = &buf[1], *fps;
6698         char *str = (char *)&pcs[nframes];
6699         int size, offs = 0, i, j;
6700         uintptr_t old = mstate->dtms_scratch_ptr, saved;
6701         uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6702         char *sym;
6703
6704         /*
6705          * Should be taking a faster path if string space has not been
6706          * allocated.
6707          */
6708         ASSERT(strsize != 0);
6709
6710         /*
6711          * We will first allocate some temporary space for the frame pointers.
6712          */
6713         fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6714         size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6715             (nframes * sizeof (uint64_t));
6716
6717         if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
6718                 /*
6719                  * Not enough room for our frame pointers -- need to indicate
6720                  * that we ran out of scratch space.
6721                  */
6722                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6723                 return;
6724         }
6725
6726         mstate->dtms_scratch_ptr += size;
6727         saved = mstate->dtms_scratch_ptr;
6728
6729         /*
6730          * Now get a stack with both program counters and frame pointers.
6731          */
6732         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6733         dtrace_getufpstack(buf, fps, nframes + 1);
6734         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6735
6736         /*
6737          * If that faulted, we're cooked.
6738          */
6739         if (*flags & CPU_DTRACE_FAULT)
6740                 goto out;
6741
6742         /*
6743          * Now we want to walk up the stack, calling the USTACK helper.  For
6744          * each iteration, we restore the scratch pointer.
6745          */
6746         for (i = 0; i < nframes; i++) {
6747                 mstate->dtms_scratch_ptr = saved;
6748
6749                 if (offs >= strsize)
6750                         break;
6751
6752                 sym = (char *)(uintptr_t)dtrace_helper(
6753                     DTRACE_HELPER_ACTION_USTACK,
6754                     mstate, state, pcs[i], fps[i]);
6755
6756                 /*
6757                  * If we faulted while running the helper, we're going to
6758                  * clear the fault and null out the corresponding string.
6759                  */
6760                 if (*flags & CPU_DTRACE_FAULT) {
6761                         *flags &= ~CPU_DTRACE_FAULT;
6762                         str[offs++] = '\0';
6763                         continue;
6764                 }
6765
6766                 if (sym == NULL) {
6767                         str[offs++] = '\0';
6768                         continue;
6769                 }
6770
6771                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6772
6773                 /*
6774                  * Now copy in the string that the helper returned to us.
6775                  */
6776                 for (j = 0; offs + j < strsize; j++) {
6777                         if ((str[offs + j] = sym[j]) == '\0')
6778                                 break;
6779                 }
6780
6781                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6782
6783                 offs += j + 1;
6784         }
6785
6786         if (offs >= strsize) {
6787                 /*
6788                  * If we didn't have room for all of the strings, we don't
6789                  * abort processing -- this needn't be a fatal error -- but we
6790                  * still want to increment a counter (dts_stkstroverflows) to
6791                  * allow this condition to be warned about.  (If this is from
6792                  * a jstack() action, it is easily tuned via jstackstrsize.)
6793                  */
6794                 dtrace_error(&state->dts_stkstroverflows);
6795         }
6796
6797         while (offs < strsize)
6798                 str[offs++] = '\0';
6799
6800 out:
6801         mstate->dtms_scratch_ptr = old;
6802 }
6803
6804 static void
6805 dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6806     size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6807 {
6808         volatile uint16_t *flags;
6809         uint64_t val = *valp;
6810         size_t valoffs = *valoffsp;
6811
6812         flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6813         ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6814
6815         /*
6816          * If this is a string, we're going to only load until we find the zero
6817          * byte -- after which we'll store zero bytes.
6818          */
6819         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6820                 char c = '\0' + 1;
6821                 size_t s;
6822
6823                 for (s = 0; s < size; s++) {
6824                         if (c != '\0' && dtkind == DIF_TF_BYREF) {
6825                                 c = dtrace_load8(val++);
6826                         } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6827                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6828                                 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6829                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6830                                 if (*flags & CPU_DTRACE_FAULT)
6831                                         break;
6832                         }
6833
6834                         DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6835
6836                         if (c == '\0' && intuple)
6837                                 break;
6838                 }
6839         } else {
6840                 uint8_t c;
6841                 while (valoffs < end) {
6842                         if (dtkind == DIF_TF_BYREF) {
6843                                 c = dtrace_load8(val++);
6844                         } else if (dtkind == DIF_TF_BYUREF) {
6845                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6846                                 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6847                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6848                                 if (*flags & CPU_DTRACE_FAULT)
6849                                         break;
6850                         }
6851
6852                         DTRACE_STORE(uint8_t, tomax,
6853                             valoffs++, c);
6854                 }
6855         }
6856
6857         *valp = val;
6858         *valoffsp = valoffs;
6859 }
6860
6861 /*
6862  * Disables interrupts and sets the per-thread inprobe flag. When DEBUG is
6863  * defined, we also assert that we are not recursing unless the probe ID is an
6864  * error probe.
6865  */
6866 static dtrace_icookie_t
6867 dtrace_probe_enter(dtrace_id_t id)
6868 {
6869         thread_t thread = current_thread();
6870         uint16_t inprobe;
6871
6872         dtrace_icookie_t cookie;
6873
6874         cookie = dtrace_interrupt_disable();
6875
6876         /*
6877          * Unless this is an ERROR probe, we are not allowed to recurse in
6878          * dtrace_probe(). Recursing into DTrace probe usually means that a
6879          * function is instrumented that should not have been instrumented or
6880          * that the ordering guarantee of the records will be violated,
6881          * resulting in unexpected output. If there is an exception to this
6882          * assertion, a new case should be added.
6883          */
6884         inprobe = dtrace_get_thread_inprobe(thread);
6885         VERIFY(inprobe == 0 ||
6886             id == dtrace_probeid_error);
6887         ASSERT(inprobe < UINT16_MAX);
6888         dtrace_set_thread_inprobe(thread, inprobe + 1);
6889
6890         return (cookie);
6891 }
6892
6893 /*
6894  * Clears the per-thread inprobe flag and enables interrupts.
6895  */
6896 static void
6897 dtrace_probe_exit(dtrace_icookie_t cookie)
6898 {
6899         thread_t thread = current_thread();
6900         uint16_t inprobe = dtrace_get_thread_inprobe(thread);
6901
6902         ASSERT(inprobe > 0);
6903         dtrace_set_thread_inprobe(thread, inprobe - 1);
6904
6905 #if INTERRUPT_MASKED_DEBUG
6906         ml_spin_debug_reset(thread);
6907 #endif /* INTERRUPT_MASKED_DEBUG */
6908
6909         dtrace_interrupt_enable(cookie);
6910 }
6911
6912 /*
6913  * If you're looking for the epicenter of DTrace, you just found it.  This
6914  * is the function called by the provider to fire a probe -- from which all
6915  * subsequent probe-context DTrace activity emanates.
6916  */
6917 void
6918 dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6919     uint64_t arg2, uint64_t arg3, uint64_t arg4)
6920 {
6921         processorid_t cpuid;
6922         dtrace_icookie_t cookie;
6923         dtrace_probe_t *probe;
6924         dtrace_mstate_t mstate;
6925         dtrace_ecb_t *ecb;
6926         dtrace_action_t *act;
6927         intptr_t offs;
6928         size_t size;
6929         int vtime, onintr;
6930         volatile uint16_t *flags;
6931         hrtime_t now;
6932
6933         cookie = dtrace_probe_enter(id);
6934         probe = dtrace_probes[id - 1];
6935         cpuid = CPU->cpu_id;
6936         onintr = CPU_ON_INTR(CPU);
6937
6938         if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6939             probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
6940                 /*
6941                  * We have hit in the predicate cache; we know that
6942                  * this predicate would evaluate to be false.
6943                  */
6944                 dtrace_probe_exit(cookie);
6945                 return;
6946         }
6947
6948         if (panic_quiesce) {
6949                 /*
6950                  * We don't trace anything if we're panicking.
6951                  */
6952                 dtrace_probe_exit(cookie);
6953                 return;
6954         }
6955
6956 #if !defined(__APPLE__)
6957         now = dtrace_gethrtime();
6958         vtime = dtrace_vtime_references != 0;
6959
6960         if (vtime && curthread->t_dtrace_start)
6961                 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
6962 #else
6963         /*
6964          * APPLE NOTE:  The time spent entering DTrace and arriving
6965          * to this point, is attributed to the current thread.
6966          * Instead it should accrue to DTrace.  FIXME
6967          */
6968         vtime = dtrace_vtime_references != 0;
6969
6970         if (vtime)
6971         {
6972                 int64_t dtrace_accum_time, recent_vtime;
6973                 thread_t thread = current_thread();
6974
6975                 dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
6976
6977                 if (dtrace_accum_time >= 0) {
6978                         recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
6979
6980                         recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
6981
6982                         dtrace_set_thread_vtime(thread, recent_vtime);
6983                 }
6984         }
6985
6986         now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
6987 #endif /* __APPLE__ */
6988
6989         /*
6990          * APPLE NOTE: A provider may call dtrace_probe_error() in lieu of
6991          * dtrace_probe() in some circumstances.   See, e.g. fasttrap_isa.c.
6992          * However the provider has no access to ECB context, so passes
6993          * 0 through "arg0" and the probe_id of the overridden probe as arg1.
6994          * Detect that here and cons up a viable state (from the probe_id).
6995          */
6996         if (dtrace_probeid_error == id && 0 == arg0) {
6997                 dtrace_id_t ftp_id = (dtrace_id_t)arg1;
6998                 dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
6999                 dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
7000
7001                 if (NULL != ftp_ecb) {
7002                         dtrace_state_t *ftp_state = ftp_ecb->dte_state;
7003
7004                         arg0 = (uint64_t)(uintptr_t)ftp_state;
7005                         arg1 = ftp_ecb->dte_epid;
7006                         /*
7007                          * args[2-4] established by caller.
7008                          */
7009                         ftp_state->dts_arg_error_illval = -1; /* arg5 */
7010                 }
7011         }
7012
7013         mstate.dtms_difo = NULL;
7014         mstate.dtms_probe = probe;
7015         mstate.dtms_strtok = 0;
7016         mstate.dtms_arg[0] = arg0;
7017         mstate.dtms_arg[1] = arg1;
7018         mstate.dtms_arg[2] = arg2;
7019         mstate.dtms_arg[3] = arg3;
7020         mstate.dtms_arg[4] = arg4;
7021
7022         flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7023
7024         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7025                 dtrace_predicate_t *pred = ecb->dte_predicate;
7026                 dtrace_state_t *state = ecb->dte_state;
7027                 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7028                 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7029                 dtrace_vstate_t *vstate = &state->dts_vstate;
7030                 dtrace_provider_t *prov = probe->dtpr_provider;
7031                 uint64_t tracememsize = 0;
7032                 int committed = 0;
7033                 caddr_t tomax;
7034
7035                 /*
7036                  * A little subtlety with the following (seemingly innocuous)
7037                  * declaration of the automatic 'val':  by looking at the
7038                  * code, you might think that it could be declared in the
7039                  * action processing loop, below.  (That is, it's only used in
7040                  * the action processing loop.)  However, it must be declared
7041                  * out of that scope because in the case of DIF expression
7042                  * arguments to aggregating actions, one iteration of the
7043                  * action loop will use the last iteration's value.
7044                  */
7045 #ifdef lint
7046                 uint64_t val = 0;
7047 #else
7048                 uint64_t val = 0;
7049 #endif
7050
7051                 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7052                 *flags &= ~CPU_DTRACE_ERROR;
7053
7054                 if (prov == dtrace_provider) {
7055                         /*
7056                          * If dtrace itself is the provider of this probe,
7057                          * we're only going to continue processing the ECB if
7058                          * arg0 (the dtrace_state_t) is equal to the ECB's
7059                          * creating state.  (This prevents disjoint consumers
7060                          * from seeing one another's metaprobes.)
7061                          */
7062                         if (arg0 != (uint64_t)(uintptr_t)state)
7063                                 continue;
7064                 }
7065
7066                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7067                         /*
7068                          * We're not currently active.  If our provider isn't
7069                          * the dtrace pseudo provider, we're not interested.
7070                          */
7071                         if (prov != dtrace_provider)
7072                                 continue;
7073
7074                         /*
7075                          * Now we must further check if we are in the BEGIN
7076                          * probe.  If we are, we will only continue processing
7077                          * if we're still in WARMUP -- if one BEGIN enabling
7078                          * has invoked the exit() action, we don't want to
7079                          * evaluate subsequent BEGIN enablings.
7080                          */
7081                         if (probe->dtpr_id == dtrace_probeid_begin &&
7082                             state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7083                                 ASSERT(state->dts_activity ==
7084                                     DTRACE_ACTIVITY_DRAINING);
7085                                 continue;
7086                         }
7087                 }
7088
7089                 if (ecb->dte_cond) {
7090                         /*
7091                          * If the dte_cond bits indicate that this
7092                          * consumer is only allowed to see user-mode firings
7093                          * of this probe, call the provider's dtps_usermode()
7094                          * entry point to check that the probe was fired
7095                          * while in a user context. Skip this ECB if that's
7096                          * not the case.
7097                          */
7098                         if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7099                             prov->dtpv_pops.dtps_usermode &&
7100                             prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7101                             probe->dtpr_id, probe->dtpr_arg) == 0)
7102                                 continue;
7103
7104                         /*
7105                          * This is more subtle than it looks. We have to be
7106                          * absolutely certain that CRED() isn't going to
7107                          * change out from under us so it's only legit to
7108                          * examine that structure if we're in constrained
7109                          * situations. Currently, the only time we'll do this
7110                          * check is if a non-super-user has enabled the
7111                          * profile or syscall providers -- providers that
7112                          * allow visibility of all processes. For the
7113                          * profile case, the check above will ensure that
7114                          * we're examining a user context.
7115                          */
7116                         if (ecb->dte_cond & DTRACE_COND_OWNER) {
7117                                 cred_t *cr;
7118                                 cred_t *s_cr =
7119                                     ecb->dte_state->dts_cred.dcr_cred;
7120                                 proc_t *proc;
7121 #pragma unused(proc) /* __APPLE__ */
7122
7123                                 ASSERT(s_cr != NULL);
7124
7125                         /*
7126                          * XXX this is hackish, but so is setting a variable
7127                          * XXX in a McCarthy OR...
7128                          */
7129                                 if ((cr = dtrace_CRED()) == NULL ||
7130                                     posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
7131                                     posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
7132                                     posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
7133                                     posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
7134                                     posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
7135                                     posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
7136 #if !defined(__APPLE__)
7137                                     (proc = ttoproc(curthread)) == NULL ||
7138                                     (proc->p_flag & SNOCD))
7139 #else
7140                                         1) /* APPLE NOTE: Darwin omits "No Core Dump" flag */
7141 #endif /* __APPLE__ */
7142                                         continue;
7143                         }
7144
7145                         if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7146                                 cred_t *cr;
7147                                 cred_t *s_cr =
7148                                     ecb->dte_state->dts_cred.dcr_cred;
7149 #pragma unused(cr, s_cr) /* __APPLE__ */
7150
7151                                 ASSERT(s_cr != NULL);
7152
7153 #if !defined(__APPLE__)
7154                                 if ((cr = CRED()) == NULL ||
7155                                     s_cr->cr_zone->zone_id !=
7156                                     cr->cr_zone->zone_id)
7157                                         continue;
7158 #else
7159                                 /* APPLE NOTE: Darwin doesn't do zones. */
7160 #endif /* __APPLE__ */
7161                         }
7162                 }
7163
7164                 if (now - state->dts_alive > dtrace_deadman_timeout) {
7165                         /*
7166                          * We seem to be dead.  Unless we (a) have kernel
7167                          * destructive permissions (b) have explicitly enabled
7168                          * destructive actions and (c) destructive actions have
7169                          * not been disabled, we're going to transition into
7170                          * the KILLED state, from which no further processing
7171                          * on this state will be performed.
7172                          */
7173                         if (!dtrace_priv_kernel_destructive(state) ||
7174                             !state->dts_cred.dcr_destructive ||
7175                             dtrace_destructive_disallow) {
7176                                 void *activity = &state->dts_activity;
7177                                 dtrace_activity_t current;
7178
7179                                 do {
7180                                         current = state->dts_activity;
7181                                 } while (dtrace_cas32(activity, current,
7182                                     DTRACE_ACTIVITY_KILLED) != current);
7183
7184                                 continue;
7185                         }
7186                 }
7187
7188                 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7189                     ecb->dte_alignment, state, &mstate)) < 0)
7190                         continue;
7191
7192                 tomax = buf->dtb_tomax;
7193                 ASSERT(tomax != NULL);
7194
7195                 /*
7196                  * Build and store the record header corresponding to the ECB.
7197                  */
7198                 if (ecb->dte_size != 0) {
7199                         dtrace_rechdr_t dtrh;
7200
7201                         if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7202                                 mstate.dtms_timestamp = dtrace_gethrtime();
7203                                 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7204                         }
7205
7206                         ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7207
7208                         dtrh.dtrh_epid = ecb->dte_epid;
7209                         DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
7210                         DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
7211                 }
7212
7213                 mstate.dtms_epid = ecb->dte_epid;
7214                 mstate.dtms_present |= DTRACE_MSTATE_EPID;
7215
7216                 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7217                         mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7218                 else
7219                         mstate.dtms_access = 0;
7220
7221                 if (pred != NULL) {
7222                         dtrace_difo_t *dp = pred->dtp_difo;
7223                         uint64_t rval;
7224
7225                         rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7226
7227                         if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7228                                 dtrace_cacheid_t cid = probe->dtpr_predcache;
7229
7230                                 if (cid != DTRACE_CACHEIDNONE && !onintr) {
7231                                         /*
7232                                          * Update the predicate cache...
7233                                          */
7234                                         ASSERT(cid == pred->dtp_cacheid);
7235
7236                                         dtrace_set_thread_predcache(current_thread(), cid);
7237                                 }
7238
7239                                 continue;
7240                         }
7241                 }
7242
7243                 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7244                     act != NULL; act = act->dta_next) {
7245                         size_t valoffs;
7246                         dtrace_difo_t *dp;
7247                         dtrace_recdesc_t *rec = &act->dta_rec;
7248
7249                         size = rec->dtrd_size;
7250                         valoffs = offs + rec->dtrd_offset;
7251
7252                         if (DTRACEACT_ISAGG(act->dta_kind)) {
7253                                 uint64_t v = 0xbad;
7254                                 dtrace_aggregation_t *agg;
7255
7256                                 agg = (dtrace_aggregation_t *)act;
7257
7258                                 if ((dp = act->dta_difo) != NULL)
7259                                         v = dtrace_dif_emulate(dp,
7260                                             &mstate, vstate, state);
7261
7262                                 if (*flags & CPU_DTRACE_ERROR)
7263                                         continue;
7264
7265                                 /*
7266                                  * Note that we always pass the expression
7267                                  * value from the previous iteration of the
7268                                  * action loop.  This value will only be used
7269                                  * if there is an expression argument to the
7270                                  * aggregating action, denoted by the
7271                                  * dtag_hasarg field.
7272                                  */
7273                                 dtrace_aggregate(agg, buf,
7274                                     offs, aggbuf, v, val);
7275                                 continue;
7276                         }
7277
7278                         switch (act->dta_kind) {
7279                         case DTRACEACT_STOP:
7280                                 if (dtrace_priv_proc_destructive(state))
7281                                         dtrace_action_stop();
7282                                 continue;
7283
7284                         case DTRACEACT_BREAKPOINT:
7285                                 if (dtrace_priv_kernel_destructive(state))
7286                                         dtrace_action_breakpoint(ecb);
7287                                 continue;
7288
7289                         case DTRACEACT_PANIC:
7290                                 if (dtrace_priv_kernel_destructive(state))
7291                                         dtrace_action_panic(ecb);
7292                                 continue;
7293
7294                         case DTRACEACT_STACK:
7295                                 if (!dtrace_priv_kernel(state))
7296                                         continue;
7297
7298                                 dtrace_getpcstack((pc_t *)(tomax + valoffs),
7299                                     size / sizeof (pc_t), probe->dtpr_aframes,
7300                                     DTRACE_ANCHORED(probe) ? NULL :
7301                                   (uint32_t *)(uintptr_t)arg0);
7302                                 continue;
7303
7304                         case DTRACEACT_JSTACK:
7305                         case DTRACEACT_USTACK:
7306                                 if (!dtrace_priv_proc(state))
7307                                         continue;
7308
7309                                 /*
7310                                  * See comment in DIF_VAR_PID.
7311                                  */
7312                                 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7313                                     CPU_ON_INTR(CPU)) {
7314                                         int depth = DTRACE_USTACK_NFRAMES(
7315                                             rec->dtrd_arg) + 1;
7316
7317                                         dtrace_bzero((void *)(tomax + valoffs),
7318                                             DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7319                                             + depth * sizeof (uint64_t));
7320
7321                                         continue;
7322                                 }
7323
7324                                 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7325                                     curproc->p_dtrace_helpers != NULL) {
7326                                         /*
7327                                          * This is the slow path -- we have
7328                                          * allocated string space, and we're
7329                                          * getting the stack of a process that
7330                                          * has helpers.  Call into a separate
7331                                          * routine to perform this processing.
7332                                          */
7333                                         dtrace_action_ustack(&mstate, state,
7334                                             (uint64_t *)(tomax + valoffs),
7335                                             rec->dtrd_arg);
7336                                         continue;
7337                                 }
7338
7339                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7340                                 dtrace_getupcstack((uint64_t *)
7341                                     (tomax + valoffs),
7342                                     DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7343                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7344                                 continue;
7345
7346                         default:
7347                                 break;
7348                         }
7349
7350                         dp = act->dta_difo;
7351                         ASSERT(dp != NULL);
7352
7353                         val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7354
7355                         if (*flags & CPU_DTRACE_ERROR)
7356                                 continue;
7357
7358                         switch (act->dta_kind) {
7359                         case DTRACEACT_SPECULATE: {
7360                                 dtrace_rechdr_t *dtrh = NULL;
7361
7362                                 ASSERT(buf == &state->dts_buffer[cpuid]);
7363                                 buf = dtrace_speculation_buffer(state,
7364                                     cpuid, val);
7365
7366                                 if (buf == NULL) {
7367                                         *flags |= CPU_DTRACE_DROP;
7368                                         continue;
7369                                 }
7370
7371                                 offs = dtrace_buffer_reserve(buf,
7372                                     ecb->dte_needed, ecb->dte_alignment,
7373                                     state, NULL);
7374
7375                                 if (offs < 0) {
7376                                         *flags |= CPU_DTRACE_DROP;
7377                                         continue;
7378                                 }
7379
7380                                 tomax = buf->dtb_tomax;
7381                                 ASSERT(tomax != NULL);
7382
7383                                 if (ecb->dte_size == 0)
7384                                         continue;
7385
7386                                 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7387                                 dtrh = ((void *)(tomax + offs));
7388                                 dtrh->dtrh_epid = ecb->dte_epid;
7389
7390                                 /*
7391                                  * When the speculation is committed, all of
7392                                  * the records in the speculative buffer will
7393                                  * have their timestamps set to the commit
7394                                  * time.  Until then, it is set to a sentinel
7395                                  * value, for debugability.
7396                                  */
7397                                 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7398
7399                                 continue;
7400                         }
7401
7402                         case DTRACEACT_CHILL:
7403                                 if (dtrace_priv_kernel_destructive(state))
7404                                         dtrace_action_chill(&mstate, val);
7405                                 continue;
7406
7407                         case DTRACEACT_RAISE:
7408                                 if (dtrace_priv_proc_destructive(state))
7409                                         dtrace_action_raise(val);
7410                                 continue;
7411
7412                         case DTRACEACT_PIDRESUME:   /* __APPLE__ */
7413                                 if (dtrace_priv_proc_destructive(state))
7414                                         dtrace_action_pidresume(val);
7415                                 continue;
7416
7417                         case DTRACEACT_COMMIT:
7418                                 ASSERT(!committed);
7419
7420                                 /*
7421                                  * We need to commit our buffer state.
7422                                  */
7423                                 if (ecb->dte_size)
7424                                         buf->dtb_offset = offs + ecb->dte_size;
7425                                 buf = &state->dts_buffer[cpuid];
7426                                 dtrace_speculation_commit(state, cpuid, val);
7427                                 committed = 1;
7428                                 continue;
7429
7430                         case DTRACEACT_DISCARD:
7431                                 dtrace_speculation_discard(state, cpuid, val);
7432                                 continue;
7433
7434                         case DTRACEACT_DIFEXPR:
7435                         case DTRACEACT_LIBACT:
7436                         case DTRACEACT_PRINTF:
7437                         case DTRACEACT_PRINTA:
7438                         case DTRACEACT_SYSTEM:
7439                         case DTRACEACT_FREOPEN:
7440                         case DTRACEACT_APPLEBINARY:   /* __APPLE__ */
7441                         case DTRACEACT_TRACEMEM:
7442                                 break;
7443
7444                         case DTRACEACT_TRACEMEM_DYNSIZE:
7445                                 tracememsize = val;
7446                                 break;
7447
7448                         case DTRACEACT_SYM:
7449                         case DTRACEACT_MOD:
7450                                 if (!dtrace_priv_kernel(state))
7451                                         continue;
7452                                 break;
7453
7454                         case DTRACEACT_USYM:
7455                         case DTRACEACT_UMOD:
7456                         case DTRACEACT_UADDR: {
7457                                 if (!dtrace_priv_proc(state))
7458                                         continue;
7459
7460                                 DTRACE_STORE(uint64_t, tomax,
7461                                     valoffs, (uint64_t)dtrace_proc_selfpid());
7462                                 DTRACE_STORE(uint64_t, tomax,
7463                                     valoffs + sizeof (uint64_t), val);
7464
7465                                 continue;
7466                         }
7467
7468                         case DTRACEACT_EXIT: {
7469                                 /*
7470                                  * For the exit action, we are going to attempt
7471                                  * to atomically set our activity to be
7472                                  * draining.  If this fails (either because
7473                                  * another CPU has beat us to the exit action,
7474                                  * or because our current activity is something
7475                                  * other than ACTIVE or WARMUP), we will
7476                                  * continue.  This assures that the exit action
7477                                  * can be successfully recorded at most once
7478                                  * when we're in the ACTIVE state.  If we're
7479                                  * encountering the exit() action while in
7480                                  * COOLDOWN, however, we want to honor the new
7481                                  * status code.  (We know that we're the only
7482                                  * thread in COOLDOWN, so there is no race.)
7483                                  */
7484                                 void *activity = &state->dts_activity;
7485                                 dtrace_activity_t current = state->dts_activity;
7486
7487                                 if (current == DTRACE_ACTIVITY_COOLDOWN)
7488                                         break;
7489
7490                                 if (current != DTRACE_ACTIVITY_WARMUP)
7491                                         current = DTRACE_ACTIVITY_ACTIVE;
7492
7493                                 if (dtrace_cas32(activity, current,
7494                                     DTRACE_ACTIVITY_DRAINING) != current) {
7495                                         *flags |= CPU_DTRACE_DROP;
7496                                         continue;
7497                                 }
7498
7499                                 break;
7500                         }
7501
7502                         default:
7503                                 ASSERT(0);
7504                         }
7505
7506                         if (dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF)) {
7507                                 uintptr_t end = valoffs + size;
7508
7509                                 if (tracememsize != 0 &&
7510                                     valoffs + tracememsize < end)
7511                                 {
7512                                         end = valoffs + tracememsize;
7513                                         tracememsize = 0;
7514                                 }
7515
7516                                 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7517                                     !dtrace_vcanload((void *)(uintptr_t)val,
7518                                     &dp->dtdo_rtype, NULL, &mstate, vstate))
7519                                 {
7520                                         continue;
7521                                 }
7522
7523                                 dtrace_store_by_ref(dp, tomax, size, &valoffs,
7524                                     &val, end, act->dta_intuple,
7525                                     dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7526                                     DIF_TF_BYREF: DIF_TF_BYUREF);
7527
7528                                 continue;
7529                         }
7530
7531                         switch (size) {
7532                         case 0:
7533                                 break;
7534
7535                         case sizeof (uint8_t):
7536                                 DTRACE_STORE(uint8_t, tomax, valoffs, val);
7537                                 break;
7538                         case sizeof (uint16_t):
7539                                 DTRACE_STORE(uint16_t, tomax, valoffs, val);
7540                                 break;
7541                         case sizeof (uint32_t):
7542                                 DTRACE_STORE(uint32_t, tomax, valoffs, val);
7543                                 break;
7544                         case sizeof (uint64_t):
7545                                 DTRACE_STORE(uint64_t, tomax, valoffs, val);
7546                                 break;
7547                         default:
7548                                 /*
7549                                  * Any other size should have been returned by
7550                                  * reference, not by value.
7551                                  */
7552                                 ASSERT(0);
7553                                 break;
7554                         }
7555                 }
7556
7557                 if (*flags & CPU_DTRACE_DROP)
7558                         continue;
7559
7560                 if (*flags & CPU_DTRACE_FAULT) {
7561                         int ndx;
7562                         dtrace_action_t *err;
7563
7564                         buf->dtb_errors++;
7565
7566                         if (probe->dtpr_id == dtrace_probeid_error) {
7567                                 /*
7568                                  * There's nothing we can do -- we had an
7569                                  * error on the error probe.  We bump an
7570                                  * error counter to at least indicate that
7571                                  * this condition happened.
7572                                  */
7573                                 dtrace_error(&state->dts_dblerrors);
7574                                 continue;
7575                         }
7576
7577                         if (vtime) {
7578                                 /*
7579                                  * Before recursing on dtrace_probe(), we
7580                                  * need to explicitly clear out our start
7581                                  * time to prevent it from being accumulated
7582                                  * into t_dtrace_vtime.
7583                                  */
7584
7585                                 /*
7586                                  * Darwin sets the sign bit on t_dtrace_tracing
7587                                  * to suspend accumulation to it.
7588                                  */
7589                                 dtrace_set_thread_tracing(current_thread(),
7590                                     (1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
7591
7592                         }
7593
7594                         /*
7595                          * Iterate over the actions to figure out which action
7596                          * we were processing when we experienced the error.
7597                          * Note that act points _past_ the faulting action; if
7598                          * act is ecb->dte_action, the fault was in the
7599                          * predicate, if it's ecb->dte_action->dta_next it's
7600                          * in action #1, and so on.
7601                          */
7602                         for (err = ecb->dte_action, ndx = 0;
7603                             err != act; err = err->dta_next, ndx++)
7604                                 continue;
7605
7606                         dtrace_probe_error(state, ecb->dte_epid, ndx,
7607                             (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7608                             mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7609                             cpu_core[cpuid].cpuc_dtrace_illval);
7610
7611                         continue;
7612                 }
7613
7614                 if (!committed)
7615                         buf->dtb_offset = offs + ecb->dte_size;
7616         }
7617
7618         /* FIXME: On Darwin the time spent leaving DTrace from this point to the rti is attributed
7619            to the current thread. Instead it should accrue to DTrace. */
7620         if (vtime) {
7621                 thread_t thread = current_thread();
7622                 int64_t t = dtrace_get_thread_tracing(thread);
7623
7624                 if (t >= 0) {
7625                         /* Usual case, accumulate time spent here into t_dtrace_tracing */
7626                         dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
7627                 } else {
7628                         /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
7629                         dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
7630                 }
7631         }
7632
7633         dtrace_probe_exit(cookie);
7634 }
7635
7636 /*
7637  * DTrace Probe Hashing Functions
7638  *
7639  * The functions in this section (and indeed, the functions in remaining
7640  * sections) are not _called_ from probe context.  (Any exceptions to this are
7641  * marked with a "Note:".)  Rather, they are called from elsewhere in the
7642  * DTrace framework to look-up probes in, add probes to and remove probes from
7643  * the DTrace probe hashes.  (Each probe is hashed by each element of the
7644  * probe tuple -- allowing for fast lookups, regardless of what was
7645  * specified.)
7646  */
7647 static uint_t
7648 dtrace_hash_str(const char *p)
7649 {
7650         unsigned int g;
7651         uint_t hval = 0;
7652
7653         while (*p) {
7654                 hval = (hval << 4) + *p++;
7655                 if ((g = (hval & 0xf0000000)) != 0)
7656                         hval ^= g >> 24;
7657                 hval &= ~g;
7658         }
7659         return (hval);
7660 }
7661
7662 static const char*
7663 dtrace_strkey_probe_provider(void *elm, uintptr_t offs)
7664 {
7665 #pragma unused(offs)
7666         dtrace_probe_t *probe = (dtrace_probe_t*)elm;
7667         return probe->dtpr_provider->dtpv_name;
7668 }
7669
/*
 * String-key callback for keys stored inline: returns the address that lies
 * offs bytes past the start of elm, viewed as a string.
 */
static const char*
dtrace_strkey_offset(void *elm, uintptr_t offs)
{
        uintptr_t addr = (uintptr_t)elm + offs;

        return ((char *)addr);
}
7675
/*
 * String-key callback for keys stored by pointer: reads the char pointer
 * located offs bytes past the start of elm and returns it.
 */
static const char*
dtrace_strkey_deref_offset(void *elm, uintptr_t offs)
{
        char **slot = (char **)((uintptr_t)elm + offs);

        return (*slot);
}
7681
7682 static dtrace_hash_t *
7683 dtrace_hash_create(dtrace_strkey_f func, uintptr_t arg, uintptr_t nextoffs, uintptr_t prevoffs)
7684 {
7685         dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7686
7687         hash->dth_getstr = func;
7688         hash->dth_stroffs = arg;
7689         hash->dth_nextoffs = nextoffs;
7690         hash->dth_prevoffs = prevoffs;
7691
7692         hash->dth_size = 1;
7693         hash->dth_mask = hash->dth_size - 1;
7694
7695         hash->dth_tab = kmem_zalloc(hash->dth_size *
7696             sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7697
7698         return (hash);
7699 }
7700
7701 /*
7702  * APPLE NOTE: dtrace_hash_destroy is not used.
7703  * It is called by dtrace_detach which is not
7704  * currently implemented.  Revisit someday.
7705  */
7706 #if !defined(__APPLE__)
7707 static void
7708 dtrace_hash_destroy(dtrace_hash_t *hash)
7709 {
7710 #if DEBUG
7711         int i;
7712
7713         for (i = 0; i < hash->dth_size; i++)
7714                 ASSERT(hash->dth_tab[i] == NULL);
7715 #endif
7716
7717         kmem_free(hash->dth_tab,
7718             hash->dth_size * sizeof (dtrace_hashbucket_t *));
7719         kmem_free(hash, sizeof (dtrace_hash_t));
7720 }
7721 #endif /* __APPLE__ */
7722
7723 static void
7724 dtrace_hash_resize(dtrace_hash_t *hash)
7725 {
7726         int size = hash->dth_size, i, ndx;
7727         int new_size = hash->dth_size << 1;
7728         int new_mask = new_size - 1;
7729         dtrace_hashbucket_t **new_tab, *bucket, *next;
7730
7731         ASSERT((new_size & new_mask) == 0);
7732
7733         new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7734
7735         for (i = 0; i < size; i++) {
7736                 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7737                         void *elm = bucket->dthb_chain;
7738
7739                         ASSERT(elm != NULL);
7740                         ndx = DTRACE_HASHSTR(hash, elm) & new_mask;
7741
7742                         next = bucket->dthb_next;
7743                         bucket->dthb_next = new_tab[ndx];
7744                         new_tab[ndx] = bucket;
7745                 }
7746         }
7747
7748         kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7749         hash->dth_tab = new_tab;
7750         hash->dth_size = new_size;
7751         hash->dth_mask = new_mask;
7752 }
7753
7754 static void
7755 dtrace_hash_add(dtrace_hash_t *hash, void *new)
7756 {
7757         int hashval = DTRACE_HASHSTR(hash, new);
7758         int ndx = hashval & hash->dth_mask;
7759         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7760         void **nextp, **prevp;
7761
7762         for (; bucket != NULL; bucket = bucket->dthb_next) {
7763                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7764                         goto add;
7765         }
7766
7767         if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7768                 dtrace_hash_resize(hash);
7769                 dtrace_hash_add(hash, new);
7770                 return;
7771         }
7772
7773         bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7774         bucket->dthb_next = hash->dth_tab[ndx];
7775         hash->dth_tab[ndx] = bucket;
7776         hash->dth_nbuckets++;
7777
7778 add:
7779         nextp = DTRACE_HASHNEXT(hash, new);
7780         ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7781         *nextp = bucket->dthb_chain;
7782
7783         if (bucket->dthb_chain != NULL) {
7784                 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7785                 ASSERT(*prevp == NULL);
7786                 *prevp = new;
7787         }
7788
7789         bucket->dthb_chain = new;
7790         bucket->dthb_len++;
7791 }
7792
7793 static void *
7794 dtrace_hash_lookup_string(dtrace_hash_t *hash, const char *str)
7795 {
7796         int hashval = dtrace_hash_str(str);
7797         int ndx = hashval & hash->dth_mask;
7798         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7799
7800         for (; bucket != NULL; bucket = bucket->dthb_next) {
7801                 if (strcmp(str, DTRACE_GETSTR(hash, bucket->dthb_chain)) == 0)
7802                         return (bucket->dthb_chain);
7803         }
7804
7805         return (NULL);
7806 }
7807
7808 static dtrace_probe_t *
7809 dtrace_hash_lookup(dtrace_hash_t *hash, void *template)
7810 {
7811         return dtrace_hash_lookup_string(hash, DTRACE_GETSTR(hash, template));
7812 }
7813
7814 static int
7815 dtrace_hash_collisions(dtrace_hash_t *hash, void *template)
7816 {
7817         int hashval = DTRACE_HASHSTR(hash, template);
7818         int ndx = hashval & hash->dth_mask;
7819         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7820
7821         for (; bucket != NULL; bucket = bucket->dthb_next) {
7822                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7823                         return (bucket->dthb_len);
7824         }
7825
7826         return (0);
7827 }
7828
7829 static void
7830 dtrace_hash_remove(dtrace_hash_t *hash, void *elm)
7831 {
7832         int ndx = DTRACE_HASHSTR(hash, elm) & hash->dth_mask;
7833         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7834
7835         void **prevp = DTRACE_HASHPREV(hash, elm);
7836         void **nextp = DTRACE_HASHNEXT(hash, elm);
7837
7838         /*
7839          * Find the bucket that we're removing this elm from.
7840          */
7841         for (; bucket != NULL; bucket = bucket->dthb_next) {
7842                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, elm))
7843                         break;
7844         }
7845
7846         ASSERT(bucket != NULL);
7847
7848         if (*prevp == NULL) {
7849                 if (*nextp == NULL) {
7850                         /*
7851                          * The removed element was the only element on this
7852                          * bucket; we need to remove the bucket.
7853                          */
7854                         dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7855
7856                         ASSERT(bucket->dthb_chain == elm);
7857                         ASSERT(b != NULL);
7858
7859                         if (b == bucket) {
7860                                 hash->dth_tab[ndx] = bucket->dthb_next;
7861                         } else {
7862                                 while (b->dthb_next != bucket)
7863                                         b = b->dthb_next;
7864                                 b->dthb_next = bucket->dthb_next;
7865                         }
7866
7867                         ASSERT(hash->dth_nbuckets > 0);
7868                         hash->dth_nbuckets--;
7869                         kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7870                         return;
7871                 }
7872
7873                 bucket->dthb_chain = *nextp;
7874         } else {
7875                 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7876         }
7877
7878         if (*nextp != NULL)
7879                 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7880 }
7881
7882 /*
7883  * DTrace Utility Functions
7884  *
7885  * These are random utility functions that are _not_ called from probe context.
7886  */
7887 static int
7888 dtrace_badattr(const dtrace_attribute_t *a)
7889 {
7890         return (a->dtat_name > DTRACE_STABILITY_MAX ||
7891             a->dtat_data > DTRACE_STABILITY_MAX ||
7892             a->dtat_class > DTRACE_CLASS_MAX);
7893 }
7894
7895 /*
7896  * Returns a dtrace-managed copy of a string, and will
7897  * deduplicate copies of the same string.
7898  * If the specified string is NULL, returns an empty string
7899  */
7900 static char *
7901 dtrace_strref(const char *str)
7902 {
7903         dtrace_string_t *s = NULL;
7904         size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
7905
7906         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7907
7908         if (str == NULL)
7909                 str = "";
7910
7911         for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
7912              s = *(DTRACE_HASHNEXT(dtrace_strings, s)))  {
7913                 if (strncmp(str, s->dtst_str, bufsize) != 0) {
7914                         continue;
7915                 }
7916                 ASSERT(s->dtst_refcount != UINT32_MAX);
7917                 s->dtst_refcount++;
7918                 return s->dtst_str;
7919         }
7920
7921         s = kmem_zalloc(sizeof(dtrace_string_t) + bufsize, KM_SLEEP);
7922         s->dtst_refcount = 1;
7923         (void) strlcpy(s->dtst_str, str, bufsize);
7924
7925         dtrace_hash_add(dtrace_strings, s);
7926
7927         return s->dtst_str;
7928 }
7929
7930 static void
7931 dtrace_strunref(const char *str)
7932 {
7933         ASSERT(str != NULL);
7934         dtrace_string_t *s = NULL;
7935         size_t bufsize = strlen(str) + 1;
7936
7937         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7938
7939         for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
7940              s = *(DTRACE_HASHNEXT(dtrace_strings, s)))  {
7941                 if (strncmp(str, s->dtst_str, bufsize) != 0) {
7942                         continue;
7943                 }
7944                 ASSERT(s->dtst_refcount != 0);
7945                 s->dtst_refcount--;
7946                 if (s->dtst_refcount == 0) {
7947                         dtrace_hash_remove(dtrace_strings, s);
7948                         kmem_free(s, sizeof(dtrace_string_t) + bufsize);
7949                 }
7950                 return;
7951         }
7952         panic("attempt to unref non-existent string %s", str);
7953 }
7954
#define DTRACE_ISALPHA(c)       \
        (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))

/*
 * Check a name for illegal characters.  Returns 1 if the name is bad and 0
 * if it is acceptable.
 *
 * A NULL or empty name is accepted.  Otherwise the first character must be
 * an ASCII letter, '-', '_' or '.', and every following character must be an
 * ASCII letter, a decimal digit, '-', '_', '.' or '`'.
 */
static int
dtrace_badname(const char *s)
{
        const char *cp;

        if (s == NULL || *s == '\0')
                return (0);

        /* Leading character: digits and '`' are not permitted here. */
        switch (*s) {
        case '-':
        case '_':
        case '.':
                break;
        default:
                if (!DTRACE_ISALPHA(*s))
                        return (1);
                break;
        }

        for (cp = s + 1; *cp != '\0'; cp++) {
                char c = *cp;

                if (DTRACE_ISALPHA(c) || (c >= '0' && c <= '9'))
                        continue;

                if (c == '-' || c == '_' || c == '.' || c == '`')
                        continue;

                return (1);
        }

        return (0);
}
7977
7978 static void
7979 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
7980 {
7981         uint32_t priv;
7982
7983         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
7984                 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
7985                         priv = DTRACE_PRIV_USER | DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER;
7986                 }
7987                 else {
7988                         priv = DTRACE_PRIV_ALL;
7989                 }
7990                 *uidp = 0;
7991                 *zoneidp = 0;
7992         } else {
7993                 *uidp = crgetuid(cr);
7994                 *zoneidp = crgetzoneid(cr);
7995
7996                 priv = 0;
7997                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
7998                         priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
7999                 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
8000                         priv |= DTRACE_PRIV_USER;
8001                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
8002                         priv |= DTRACE_PRIV_PROC;
8003                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
8004                         priv |= DTRACE_PRIV_OWNER;
8005                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
8006                         priv |= DTRACE_PRIV_ZONEOWNER;
8007         }
8008
8009         *privp = priv;
8010 }
8011
#ifdef DTRACE_ERRDEBUG
/*
 * Record an occurrence of the error message 'str' in dtrace_errhash, keyed
 * by the message pointer itself.  The table is probed linearly starting at
 * the slot chosen by the string's hash: a slot already holding this pointer
 * has its count bumped, and the first empty slot is claimed with a count of
 * one.  Panics if every slot is occupied by other messages.
 *
 * Also updates dtrace_errlast and dtrace_errthread.  All of this happens
 * under dtrace_errlock.
 */
static void
dtrace_errdebug(const char *str)
{
        int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
        int probes;
        int recorded = 0;

        lck_mtx_lock(&dtrace_errlock);
        dtrace_errlast = str;
        dtrace_errthread = (kthread_t *)current_thread();

        for (probes = 0; probes < DTRACE_ERRHASHSZ; probes++) {
                if (dtrace_errhash[hval].dter_msg == str) {
                        dtrace_errhash[hval].dter_count++;
                        recorded = 1;
                        break;
                }

                if (dtrace_errhash[hval].dter_msg == NULL) {
                        dtrace_errhash[hval].dter_msg = str;
                        dtrace_errhash[hval].dter_count = 1;
                        recorded = 1;
                        break;
                }

                /* Slot taken by a different message; try the next one. */
                hval = (hval + 1) % DTRACE_ERRHASHSZ;
        }

        if (!recorded)
                panic("dtrace: undersized error hash");

        lck_mtx_unlock(&dtrace_errlock);
}
#endif
8044
8045 /*
8046  * DTrace Matching Functions
8047  *
8048  * These functions are used to match groups of probes, given some elements of
8049  * a probe tuple, or some globbed expressions for elements of a probe tuple.
8050  */
8051 static int
8052 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8053     zoneid_t zoneid)
8054 {
8055         if (priv != DTRACE_PRIV_ALL) {
8056                 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8057                 uint32_t match = priv & ppriv;
8058
8059                 /*
8060                  * No PRIV_DTRACE_* privileges...
8061                  */
8062                 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8063                     DTRACE_PRIV_KERNEL)) == 0)
8064                         return (0);
8065
8066                 /*
8067                  * No matching bits, but there were bits to match...
8068                  */
8069                 if (match == 0 && ppriv != 0)
8070                         return (0);
8071
8072                 /*
8073                  * Need to have permissions to the process, but don't...
8074                  */
8075                 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8076                     uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8077                         return (0);
8078                 }
8079
8080                 /*
8081                  * Need to be in the same zone unless we possess the
8082                  * privilege to examine all zones.
8083                  */
8084                 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8085                     zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8086                         return (0);
8087                 }
8088         }
8089
8090         return (1);
8091 }
8092
8093 /*
8094  * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8095  * consists of input pattern strings and an ops-vector to evaluate them.
8096  * This function returns >0 for match, 0 for no match, and <0 for error.
8097  */
8098 static int
8099 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8100     uint32_t priv, uid_t uid, zoneid_t zoneid)
8101 {
8102         dtrace_provider_t *pvp = prp->dtpr_provider;
8103         int rv;
8104
8105         if (pvp->dtpv_defunct)
8106                 return (0);
8107
8108         if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8109                 return (rv);
8110
8111         if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8112                 return (rv);
8113
8114         if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8115                 return (rv);
8116
8117         if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8118                 return (rv);
8119
8120         if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8121                 return (0);
8122
8123         return (rv);
8124 }
8125
8126 /*
8127  * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8128  * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
8129  * libc's version, the kernel version only applies to 8-bit ASCII strings.
8130  * In addition, all of the recursion cases except for '*' matching have been
8131  * unwound.  For '*', we still implement recursive evaluation, but a depth
8132  * counter is maintained and matching is aborted if we recurse too deep.
8133  * The function returns 0 if no match, >0 if match, and <0 if recursion error.
8134  */
8135 static int
8136 dtrace_match_glob(const char *s, const char *p, int depth)
8137 {
8138         const char *olds;
8139         char s1, c;
8140         int gs;
8141
8142         if (depth > DTRACE_PROBEKEY_MAXDEPTH)
8143                 return (-1);
8144
8145         if (s == NULL)
8146                 s = ""; /* treat NULL as empty string */
8147
8148 top:
8149         olds = s;
8150         s1 = *s++;
8151
8152         if (p == NULL)
8153                 return (0);
8154
8155         if ((c = *p++) == '\0')
8156                 return (s1 == '\0');
8157
8158         switch (c) {
8159         case '[': {
8160                 int ok = 0, notflag = 0;
8161                 char lc = '\0';
8162
8163                 if (s1 == '\0')
8164                         return (0);
8165
8166                 if (*p == '!') {
8167                         notflag = 1;
8168                         p++;
8169                 }
8170
8171                 if ((c = *p++) == '\0')
8172                         return (0);
8173
8174                 do {
8175                         if (c == '-' && lc != '\0' && *p != ']') {
8176                                 if ((c = *p++) == '\0')
8177                                         return (0);
8178                                 if (c == '\\' && (c = *p++) == '\0')
8179                                         return (0);
8180
8181                                 if (notflag) {
8182                                         if (s1 < lc || s1 > c)
8183                                                 ok++;
8184                                         else
8185                                                 return (0);
8186                                 } else if (lc <= s1 && s1 <= c)
8187                                         ok++;
8188
8189                         } else if (c == '\\' && (c = *p++) == '\0')
8190                                 return (0);
8191
8192                         lc = c; /* save left-hand 'c' for next iteration */
8193
8194                         if (notflag) {
8195                                 if (s1 != c)
8196                                         ok++;
8197                                 else
8198                                         return (0);
8199                         } else if (s1 == c)
8200                                 ok++;
8201
8202                         if ((c = *p++) == '\0')
8203                                 return (0);
8204
8205                 } while (c != ']');
8206
8207                 if (ok)
8208                         goto top;
8209
8210                 return (0);
8211         }
8212
8213         case '\\':
8214                 if ((c = *p++) == '\0')
8215                         return (0);
8216                 /*FALLTHRU*/
8217
8218         default:
8219                 if (c != s1)
8220                         return (0);
8221                 /*FALLTHRU*/
8222
8223         case '?':
8224                 if (s1 != '\0')
8225                         goto top;
8226                 return (0);
8227
8228         case '*':
8229                 while (*p == '*')
8230                         p++; /* consecutive *'s are identical to a single one */
8231
8232                 if (*p == '\0')
8233                         return (1);
8234
8235                 for (s = olds; *s != '\0'; s++) {
8236                         if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
8237                                 return (gs);
8238                 }
8239
8240                 return (0);
8241         }
8242 }
8243
/*
 * Exact-match comparator: succeeds only when 's' is non-NULL and is the very
 * same pointer as 'p'.  The contents of the strings are not examined.
 */
/*ARGSUSED*/
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
#pragma unused(depth) /* __APPLE__ */
	if (s == NULL)
		return (0);

	return (s == p ? 1 : 0);
}
8251
/*
 * Module-name comparator: 'p' matches 's' when 's' begins with 'p' and the
 * character of 's' right after that prefix is either '.' or the terminating
 * NUL.  A NULL 's' or 'p' never matches.
 */
/*ARGSUSED*/
static int
dtrace_match_module(const char *s, const char *p, int depth)
{
#pragma unused(depth) /* __APPLE__ */
	size_t prefix_len;
	char tail;

	if (s == NULL || p == NULL)
		return (0);

	prefix_len = strlen(p);

	if (strncmp(p, s, prefix_len) != 0)
		return (0);

	/*
	 * The prefix compared equal, so 's' holds at least prefix_len
	 * non-NUL characters and s[prefix_len] is safe to read.
	 */
	tail = s[prefix_len];

	return ((tail == '.' || tail == '\0') ? 1 : 0);
}
8271
/*
 * Comparator for the empty pattern: it matches unconditionally, so none of
 * the arguments are looked at.
 */
/*ARGSUSED*/
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
#pragma unused(s, p, depth) /* __APPLE__ */
	return 1;
}
8279
/*
 * Comparator that ignores the pattern and matches any input string that is
 * neither NULL nor empty.
 */
/*ARGSUSED*/
static int
dtrace_match_nonzero(const char *s, const char *p, int depth)
{
#pragma unused(p, depth) /* __APPLE__ */
	if (s == NULL)
		return (0);

	return (*s != '\0');
}
8287
8288 static int
8289 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8290     zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *, void *), void *arg1, void *arg2)
8291 {
8292         dtrace_probe_t *probe;
8293         dtrace_provider_t prov_template = {
8294                 .dtpv_name = (char *)(uintptr_t)pkp->dtpk_prov
8295         };
8296
8297         dtrace_probe_t template = {
8298                 .dtpr_provider = &prov_template,
8299                 .dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod,
8300                 .dtpr_func = (char *)(uintptr_t)pkp->dtpk_func,
8301                 .dtpr_name = (char *)(uintptr_t)pkp->dtpk_name
8302         };
8303
8304         dtrace_hash_t *hash = NULL;
8305         int len, rc, best = INT_MAX, nmatched = 0;
8306         dtrace_id_t i;
8307
8308         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8309
8310         /*
8311          * If the probe ID is specified in the key, just lookup by ID and
8312          * invoke the match callback once if a matching probe is found.
8313          */
8314         if (pkp->dtpk_id != DTRACE_IDNONE) {
8315                 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8316                     dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8317                         if ((*matched)(probe, arg1, arg2) == DTRACE_MATCH_FAIL)
8318                                return (DTRACE_MATCH_FAIL);
8319                         nmatched++;
8320                 }
8321                 return (nmatched);
8322         }
8323
8324         /*
8325          * We want to find the most distinct of the provider name, module name,
8326          * function name, and name.  So for each one that is not a glob
8327          * pattern or empty string, we perform a lookup in the corresponding
8328          * hash and use the hash table with the fewest collisions to do our
8329          * search.
8330          */
8331         if (pkp->dtpk_pmatch == &dtrace_match_string &&
8332             (len = dtrace_hash_collisions(dtrace_byprov, &template)) < best) {
8333                 best = len;
8334                 hash = dtrace_byprov;
8335         }
8336
8337         if (pkp->dtpk_mmatch == &dtrace_match_string &&
8338             (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8339                 best = len;
8340                 hash = dtrace_bymod;
8341         }
8342
8343         if (pkp->dtpk_fmatch == &dtrace_match_string &&
8344             (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8345                 best = len;
8346                 hash = dtrace_byfunc;
8347         }
8348
8349         if (pkp->dtpk_nmatch == &dtrace_match_string &&
8350             (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8351                 best = len;
8352                 hash = dtrace_byname;
8353         }
8354
8355         /*
8356          * If we did not select a hash table, iterate over every probe and
8357          * invoke our callback for each one that matches our input probe key.
8358          */
8359         if (hash == NULL) {
8360                 for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
8361                         if ((probe = dtrace_probes[i]) == NULL ||
8362                             dtrace_match_probe(probe, pkp, priv, uid,
8363                             zoneid) <= 0)
8364                                 continue;
8365
8366                         nmatched++;
8367
8368                        if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8369                                if (rc == DTRACE_MATCH_FAIL)
8370                                        return (DTRACE_MATCH_FAIL);
8371                                break;
8372                        }
8373                 }
8374
8375                 return (nmatched);
8376         }
8377
8378         /*
8379          * If we selected a hash table, iterate over each probe of the same key
8380          * name and invoke the callback for every probe that matches the other
8381          * attributes of our input probe key.
8382          */
8383         for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8384             probe = *(DTRACE_HASHNEXT(hash, probe))) {
8385
8386                 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8387                         continue;
8388
8389                 nmatched++;
8390
8391                 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8392                     if (rc == DTRACE_MATCH_FAIL)
8393                         return (DTRACE_MATCH_FAIL);
8394                     break;
8395                 }
8396         }
8397
8398         return (nmatched);
8399 }
8400
8401 /*
8402  * Return the function pointer dtrace_probecmp() should use to compare the
8403  * specified pattern with a string.  For NULL or empty patterns, we select
8404  * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
8405  * For non-empty non-glob strings, we use dtrace_match_string().
8406  */
8407 static dtrace_probekey_f *
8408 dtrace_probekey_func(const char *p)
8409 {
8410         char c;
8411
8412         if (p == NULL || *p == '\0')
8413                 return (&dtrace_match_nul);
8414
8415         while ((c = *p++) != '\0') {
8416                 if (c == '[' || c == '?' || c == '*' || c == '\\')
8417                         return (&dtrace_match_glob);
8418         }
8419
8420         return (&dtrace_match_string);
8421 }
8422
8423 static dtrace_probekey_f *
8424 dtrace_probekey_module_func(const char *p)
8425 {
8426         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8427
8428         dtrace_probekey_f *f = dtrace_probekey_func(p);
8429         if (f == &dtrace_match_string) {
8430                 dtrace_probe_t template = {
8431                         .dtpr_mod = (char *)(uintptr_t)p,
8432                 };
8433                 if (dtrace_hash_lookup(dtrace_bymod, &template) == NULL) {
8434                         return (&dtrace_match_module);
8435                 }
8436                 return (&dtrace_match_string);
8437         }
8438         return f;
8439 }
8440
8441 /*
8442  * Build a probe comparison key for use with dtrace_match_probe() from the
8443  * given probe description.  By convention, a null key only matches anchored
8444  * probes: if each field is the empty string, reset dtpk_fmatch to
8445  * dtrace_match_nonzero().
8446  */
8447 static void
8448 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8449 {
8450
8451         pkp->dtpk_prov = dtrace_strref(pdp->dtpd_provider);
8452         pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8453
8454         pkp->dtpk_mod = dtrace_strref(pdp->dtpd_mod);
8455         pkp->dtpk_mmatch = dtrace_probekey_module_func(pdp->dtpd_mod);
8456
8457         pkp->dtpk_func = dtrace_strref(pdp->dtpd_func);
8458         pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8459
8460         pkp->dtpk_name = dtrace_strref(pdp->dtpd_name);
8461         pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8462
8463         pkp->dtpk_id = pdp->dtpd_id;
8464
8465         if (pkp->dtpk_id == DTRACE_IDNONE &&
8466             pkp->dtpk_pmatch == &dtrace_match_nul &&
8467             pkp->dtpk_mmatch == &dtrace_match_nul &&
8468             pkp->dtpk_fmatch == &dtrace_match_nul &&
8469             pkp->dtpk_nmatch == &dtrace_match_nul)
8470                 pkp->dtpk_fmatch = &dtrace_match_nonzero;
8471 }
8472
8473 static void
8474 dtrace_probekey_release(dtrace_probekey_t *pkp)
8475 {
8476         dtrace_strunref(pkp->dtpk_prov);
8477         dtrace_strunref(pkp->dtpk_mod);
8478         dtrace_strunref(pkp->dtpk_func);
8479         dtrace_strunref(pkp->dtpk_name);
8480 }
8481
8482 static int
8483 dtrace_cond_provider_match(dtrace_probedesc_t *desc, void *data)
8484 {
8485         if (desc == NULL)
8486                 return 1;
8487
8488         dtrace_probekey_f *func = dtrace_probekey_func(desc->dtpd_provider);
8489
8490         return func((char*)data, desc->dtpd_provider, 0);
8491 }
8492
8493 /*
8494  * DTrace Provider-to-Framework API Functions
8495  *
8496  * These functions implement much of the Provider-to-Framework API, as
8497  * described in <sys/dtrace.h>.  The parts of the API not in this section are
8498  * the functions in the API for probe management (found below), and
8499  * dtrace_probe() itself (found above).
8500  */
8501
8502 /*
8503  * Register the calling provider with the DTrace framework.  This should
8504  * generally be called by DTrace providers in their attach(9E) entry point.
8505  */
8506 int
8507 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8508     cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8509 {
8510         dtrace_provider_t *provider;
8511
8512         if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8513                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8514                     "arguments", name ? name : "<NULL>");
8515                 return (EINVAL);
8516         }
8517
8518         if (name[0] == '\0' || dtrace_badname(name)) {
8519                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8520                     "provider name", name);
8521                 return (EINVAL);
8522         }
8523
8524         if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8525             pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8526             pops->dtps_destroy == NULL ||
8527             ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8528                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8529                     "provider ops", name);
8530                 return (EINVAL);
8531         }
8532
8533         if (dtrace_badattr(&pap->dtpa_provider) ||
8534             dtrace_badattr(&pap->dtpa_mod) ||
8535             dtrace_badattr(&pap->dtpa_func) ||
8536             dtrace_badattr(&pap->dtpa_name) ||
8537             dtrace_badattr(&pap->dtpa_args)) {
8538                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8539                     "provider attributes", name);
8540                 return (EINVAL);
8541         }
8542
8543         if (priv & ~DTRACE_PRIV_ALL) {
8544                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8545                     "privilege attributes", name);
8546                 return (EINVAL);
8547         }
8548
8549         if ((priv & DTRACE_PRIV_KERNEL) &&
8550             (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8551             pops->dtps_usermode == NULL) {
8552                 cmn_err(CE_WARN, "failed to register provider '%s': need "
8553                     "dtps_usermode() op for given privilege attributes", name);
8554                 return (EINVAL);
8555         }
8556
8557         provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8558
8559         provider->dtpv_attr = *pap;
8560         provider->dtpv_priv.dtpp_flags = priv;
8561         if (cr != NULL) {
8562                 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8563                 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8564         }
8565         provider->dtpv_pops = *pops;
8566
8567         if (pops->dtps_provide == NULL) {
8568                 ASSERT(pops->dtps_provide_module != NULL);
8569                 provider->dtpv_pops.dtps_provide = dtrace_provide_nullop;
8570         }
8571
8572         if (pops->dtps_provide_module == NULL) {
8573                 ASSERT(pops->dtps_provide != NULL);
8574                 provider->dtpv_pops.dtps_provide_module =
8575                     dtrace_provide_module_nullop;
8576         }
8577
8578         if (pops->dtps_suspend == NULL) {
8579                 ASSERT(pops->dtps_resume == NULL);
8580                 provider->dtpv_pops.dtps_suspend = dtrace_suspend_nullop;
8581                 provider->dtpv_pops.dtps_resume = dtrace_resume_nullop;
8582         }
8583
8584         provider->dtpv_arg = arg;
8585         *idp = (dtrace_provider_id_t)provider;
8586
8587         if (pops == &dtrace_provider_ops) {
8588                 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8589                 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8590
8591                 provider->dtpv_name = dtrace_strref(name);
8592
8593                 ASSERT(dtrace_anon.dta_enabling == NULL);
8594
8595                 /*
8596                  * We make sure that the DTrace provider is at the head of
8597                  * the provider chain.
8598                  */
8599                 provider->dtpv_next = dtrace_provider;
8600                 dtrace_provider = provider;
8601                 return (0);
8602         }
8603
8604         lck_mtx_lock(&dtrace_provider_lock);
8605         lck_mtx_lock(&dtrace_lock);
8606
8607         provider->dtpv_name = dtrace_strref(name);
8608
8609         /*
8610          * If there is at least one provider registered, we'll add this
8611          * provider after the first provider.
8612          */
8613         if (dtrace_provider != NULL) {
8614                 provider->dtpv_next = dtrace_provider->dtpv_next;
8615                 dtrace_provider->dtpv_next = provider;
8616         } else {
8617                 dtrace_provider = provider;
8618         }
8619
8620         if (dtrace_retained != NULL) {
8621                 dtrace_enabling_provide(provider);
8622
8623                 /*
8624                  * Now we need to call dtrace_enabling_matchall_with_cond() --
8625                  * with a condition matching the provider name we just added,
8626                  * which will acquire cpu_lock and dtrace_lock.  We therefore need
8627                  * to drop all of our locks before calling into it...
8628                  */
8629                 lck_mtx_unlock(&dtrace_lock);
8630                 lck_mtx_unlock(&dtrace_provider_lock);
8631
8632                 dtrace_match_cond_t cond = {dtrace_cond_provider_match, provider->dtpv_name};
8633                 dtrace_enabling_matchall_with_cond(&cond);
8634
8635                 return (0);
8636         }
8637
8638         lck_mtx_unlock(&dtrace_lock);
8639         lck_mtx_unlock(&dtrace_provider_lock);
8640
8641         return (0);
8642 }
8643
8644 /*
8645  * Unregister the specified provider from the DTrace framework.  This should
8646  * generally be called by DTrace providers in their detach(9E) entry point.
8647  */
8648 int
8649 dtrace_unregister(dtrace_provider_id_t id)
8650 {
8651         dtrace_provider_t *old = (dtrace_provider_t *)id;
8652         dtrace_provider_t *prev = NULL;
8653         int self = 0;
8654         dtrace_probe_t *probe, *first = NULL, *next = NULL;
8655         dtrace_probe_t template = {
8656                 .dtpr_provider = old
8657         };
8658
8659         if (old->dtpv_pops.dtps_enable ==
8660             (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
8661                 /*
8662                  * If DTrace itself is the provider, we're called with locks
8663                  * already held.
8664                  */
8665                 ASSERT(old == dtrace_provider);
8666                 ASSERT(dtrace_devi != NULL);
8667                 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8668                 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8669                 self = 1;
8670
8671                 if (dtrace_provider->dtpv_next != NULL) {
8672                         /*
8673                          * There's another provider here; return failure.
8674                          */
8675                         return (EBUSY);
8676                 }
8677         } else {
8678                 lck_mtx_lock(&dtrace_provider_lock);
8679                 lck_mtx_lock(&mod_lock);
8680                 lck_mtx_lock(&dtrace_lock);
8681         }
8682
8683         /*
8684          * If anyone has /dev/dtrace open, or if there are anonymous enabled
8685          * probes, we refuse to let providers slither away, unless this
8686          * provider has already been explicitly invalidated.
8687          */
8688         if (!old->dtpv_defunct &&
8689             (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8690             dtrace_anon.dta_state->dts_necbs > 0))) {
8691                 if (!self) {
8692                         lck_mtx_unlock(&dtrace_lock);
8693                         lck_mtx_unlock(&mod_lock);
8694                         lck_mtx_unlock(&dtrace_provider_lock);
8695                 }
8696                 return (EBUSY);
8697         }
8698
8699         /*
8700          * Attempt to destroy the probes associated with this provider.
8701          */
8702         if (old->dtpv_ecb_count!=0) {
8703                 /*
8704                  * We have at least one ECB; we can't remove this provider.
8705                  */
8706                 if (!self) {
8707                         lck_mtx_unlock(&dtrace_lock);
8708                         lck_mtx_unlock(&mod_lock);
8709                         lck_mtx_unlock(&dtrace_provider_lock);
8710                 }
8711                 return (EBUSY);
8712         }
8713
8714         /*
8715          * All of the probes for this provider are disabled; we can safely
8716          * remove all of them from their hash chains and from the probe array.
8717          */
8718         for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8719             probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8720                 if (probe->dtpr_provider != old)
8721                         continue;
8722
8723                 dtrace_probes[probe->dtpr_id - 1] = NULL;
8724                 old->dtpv_probe_count--;
8725
8726                 dtrace_hash_remove(dtrace_bymod, probe);
8727                 dtrace_hash_remove(dtrace_byfunc, probe);
8728                 dtrace_hash_remove(dtrace_byname, probe);
8729
8730                 if (first == NULL) {
8731                         first = probe;
8732                         probe->dtpr_nextmod = NULL;
8733                 } else {
8734                         /*
8735                          * Use nextmod as the chain of probes to remove
8736                          */
8737                         probe->dtpr_nextmod = first;
8738                         first = probe;
8739                 }
8740         }
8741
8742         for (probe = first; probe != NULL; probe = next) {
8743                 next = probe->dtpr_nextmod;
8744                 dtrace_hash_remove(dtrace_byprov, probe);
8745         }
8746
8747         /*
8748          * The provider's probes have been removed from the hash chains and
8749          * from the probe array.  Now issue a dtrace_sync() to be sure that
8750          * everyone has cleared out from any probe array processing.
8751          */
8752         dtrace_sync();
8753
8754         for (probe = first; probe != NULL; probe = next) {
8755                 next = probe->dtpr_nextmod;
8756
8757                 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8758                     probe->dtpr_arg);
8759                 dtrace_strunref(probe->dtpr_mod);
8760                 dtrace_strunref(probe->dtpr_func);
8761                 dtrace_strunref(probe->dtpr_name);
8762                 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8763                 zfree(dtrace_probe_t_zone, probe);
8764         }
8765
8766         if ((prev = dtrace_provider) == old) {
8767                 ASSERT(self || dtrace_devi == NULL);
8768                 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8769                 dtrace_provider = old->dtpv_next;
8770         } else {
8771                 while (prev != NULL && prev->dtpv_next != old)
8772                         prev = prev->dtpv_next;
8773
8774                 if (prev == NULL) {
8775                         panic("attempt to unregister non-existent "
8776                             "dtrace provider %p\n", (void *)id);
8777                 }
8778
8779                 prev->dtpv_next = old->dtpv_next;
8780         }
8781
8782         dtrace_strunref(old->dtpv_name);
8783
8784         if (!self) {
8785                 lck_mtx_unlock(&dtrace_lock);
8786                 lck_mtx_unlock(&mod_lock);
8787                 lck_mtx_unlock(&dtrace_provider_lock);
8788         }
8789
8790         kmem_free(old, sizeof (dtrace_provider_t));
8791
8792         return (0);
8793 }
8794
8795 /*
8796  * Invalidate the specified provider.  All subsequent probe lookups for the
8797  * specified provider will fail, but its probes will not be removed.
8798  */
8799 void
8800 dtrace_invalidate(dtrace_provider_id_t id)
8801 {
8802         dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8803
8804         ASSERT(pvp->dtpv_pops.dtps_enable !=
8805             (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8806
8807         lck_mtx_lock(&dtrace_provider_lock);
8808         lck_mtx_lock(&dtrace_lock);
8809
8810         pvp->dtpv_defunct = 1;
8811
8812         lck_mtx_unlock(&dtrace_lock);
8813         lck_mtx_unlock(&dtrace_provider_lock);
8814 }
8815
8816 /*
8817  * Indicate whether or not DTrace has attached.
8818  */
8819 int
8820 dtrace_attached(void)
8821 {
8822         /*
8823          * dtrace_provider will be non-NULL iff the DTrace driver has
8824          * attached.  (It's non-NULL because DTrace is always itself a
8825          * provider.)
8826          */
8827         return (dtrace_provider != NULL);
8828 }
8829
8830 /*
8831  * Remove all the unenabled probes for the given provider.  This function is
8832  * not unlike dtrace_unregister(), except that it doesn't remove the provider
8833  * -- just as many of its associated probes as it can.
8834  */
8835 int
8836 dtrace_condense(dtrace_provider_id_t id)
8837 {
8838         dtrace_provider_t *prov = (dtrace_provider_t *)id;
8839         dtrace_probe_t *probe, *first = NULL;
8840         dtrace_probe_t template = {
8841                 .dtpr_provider = prov
8842         };
8843
8844         /*
8845          * Make sure this isn't the dtrace provider itself.
8846          */
8847         ASSERT(prov->dtpv_pops.dtps_enable !=
8848           (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8849
8850         lck_mtx_lock(&dtrace_provider_lock);
8851         lck_mtx_lock(&dtrace_lock);
8852
8853         /*
8854          * Attempt to destroy the probes associated with this provider.
8855          */
8856         for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8857             probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8858
8859                 if (probe->dtpr_provider != prov)
8860                         continue;
8861
8862                 if (probe->dtpr_ecb != NULL)
8863                         continue;
8864
8865                 dtrace_probes[probe->dtpr_id - 1] = NULL;
8866                 prov->dtpv_probe_count--;
8867
8868                 dtrace_hash_remove(dtrace_bymod, probe);
8869                 dtrace_hash_remove(dtrace_byfunc, probe);
8870                 dtrace_hash_remove(dtrace_byname, probe);
8871
8872                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
8873                     probe->dtpr_arg);
8874                 dtrace_strunref(probe->dtpr_mod);
8875                 dtrace_strunref(probe->dtpr_func);
8876                 dtrace_strunref(probe->dtpr_name);
8877                 if (first == NULL) {
8878                         first = probe;
8879                         probe->dtpr_nextmod = NULL;
8880                 } else {
8881                         /*
8882                          * Use nextmod as the chain of probes to remove
8883                          */
8884                         probe->dtpr_nextmod = first;
8885                         first = probe;
8886                 }
8887         }
8888
8889         for (probe = first; probe != NULL; probe = first) {
8890                 first = probe->dtpr_nextmod;
8891                 dtrace_hash_remove(dtrace_byprov, probe);
8892                 vmem_free(dtrace_arena, (void *)((uintptr_t)probe->dtpr_id), 1);
8893                 zfree(dtrace_probe_t_zone, probe);
8894         }
8895
8896         lck_mtx_unlock(&dtrace_lock);
8897         lck_mtx_unlock(&dtrace_provider_lock);
8898
8899         return (0);
8900 }
8901
8902 /*
8903  * DTrace Probe Management Functions
8904  *
8905  * The functions in this section perform the DTrace probe management,
8906  * including functions to create probes, look-up probes, and call into the
8907  * providers to request that probes be provided.  Some of these functions are
8908  * in the Provider-to-Framework API; these functions can be identified by the
8909  * fact that they are not declared "static".
8910  */
8911
8912 /*
8913  * Create a probe with the specified module name, function name, and name.
8914  */
8915 dtrace_id_t
8916 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8917     const char *func, const char *name, int aframes, void *arg)
8918 {
8919         dtrace_probe_t *probe, **probes;
8920         dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8921         dtrace_id_t id;
8922
8923         if (provider == dtrace_provider) {
8924                 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8925         } else {
8926                 lck_mtx_lock(&dtrace_lock);
8927         }
8928
8929         id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8930             VM_BESTFIT | VM_SLEEP);
8931
8932         probe = zalloc(dtrace_probe_t_zone);
8933         bzero(probe, sizeof (dtrace_probe_t));
8934
8935         probe->dtpr_id = id;
8936         probe->dtpr_gen = dtrace_probegen++;
8937         probe->dtpr_mod = dtrace_strref(mod);
8938         probe->dtpr_func = dtrace_strref(func);
8939         probe->dtpr_name = dtrace_strref(name);
8940         probe->dtpr_arg = arg;
8941         probe->dtpr_aframes = aframes;
8942         probe->dtpr_provider = provider;
8943
8944         dtrace_hash_add(dtrace_byprov, probe);
8945         dtrace_hash_add(dtrace_bymod, probe);
8946         dtrace_hash_add(dtrace_byfunc, probe);
8947         dtrace_hash_add(dtrace_byname, probe);
8948
8949         if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
8950                 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8951                 size_t nsize = osize * 2;
8952
8953                 probes = kmem_zalloc(nsize, KM_SLEEP);
8954
8955                 dtrace_probe_t **oprobes = dtrace_probes;
8956
8957                 bcopy(oprobes, probes, osize);
8958                 dtrace_membar_producer();
8959                 dtrace_probes = probes;
8960
8961                 dtrace_sync();
8962
8963                 /*
8964                  * All CPUs are now seeing the new probes array; we can
8965                  * safely free the old array.
8966                  */
8967                 kmem_free(oprobes, osize);
8968                 dtrace_nprobes *= 2;
8969
8970                 ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
8971         }
8972
8973         ASSERT(dtrace_probes[id - 1] == NULL);
8974         dtrace_probes[id - 1] = probe;
8975         provider->dtpv_probe_count++;
8976
8977         if (provider != dtrace_provider)
8978                 lck_mtx_unlock(&dtrace_lock);
8979
8980         return (id);
8981 }
8982
8983 static dtrace_probe_t *
8984 dtrace_probe_lookup_id(dtrace_id_t id)
8985 {
8986         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8987
8988         if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
8989                 return (NULL);
8990
8991         return (dtrace_probes[id - 1]);
8992 }
8993
8994 static int
8995 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg1, void *arg2)
8996 {
8997 #pragma unused(arg2)
8998         *((dtrace_id_t *)arg1) = probe->dtpr_id;
8999
9000         return (DTRACE_MATCH_DONE);
9001 }
9002
9003 /*
9004  * Look up a probe based on provider and one or more of module name, function
9005  * name and probe name.
9006  */
9007 dtrace_id_t
9008 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
9009     const char *func, const char *name)
9010 {
9011         dtrace_probekey_t pkey;
9012         dtrace_id_t id;
9013         int match;
9014
9015         lck_mtx_lock(&dtrace_lock);
9016
9017         pkey.dtpk_prov = dtrace_strref(((dtrace_provider_t *)prid)->dtpv_name);
9018         pkey.dtpk_pmatch = &dtrace_match_string;
9019         pkey.dtpk_mod = dtrace_strref(mod);
9020         pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9021         pkey.dtpk_func = dtrace_strref(func);
9022         pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9023         pkey.dtpk_name = dtrace_strref(name);
9024         pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9025         pkey.dtpk_id = DTRACE_IDNONE;
9026
9027         match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
9028             dtrace_probe_lookup_match, &id, NULL);
9029
9030         dtrace_probekey_release(&pkey);
9031
9032         lck_mtx_unlock(&dtrace_lock);
9033
9034         ASSERT(match == 1 || match == 0);
9035         return (match ? id : 0);
9036 }
9037
9038 /*
9039  * Returns the probe argument associated with the specified probe.
9040  */
9041 void *
9042 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9043 {
9044         dtrace_probe_t *probe;
9045         void *rval = NULL;
9046
9047         lck_mtx_lock(&dtrace_lock);
9048
9049         if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
9050             probe->dtpr_provider == (dtrace_provider_t *)id)
9051                 rval = probe->dtpr_arg;
9052
9053         lck_mtx_unlock(&dtrace_lock);
9054
9055         return (rval);
9056 }
9057
9058 /*
9059  * Copy a probe into a probe description.
9060  */
9061 static void
9062 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
9063 {
9064         bzero(pdp, sizeof (dtrace_probedesc_t));
9065         pdp->dtpd_id = prp->dtpr_id;
9066
9067         /* APPLE NOTE: Darwin employs size bounded string operation. */
9068         (void) strlcpy(pdp->dtpd_provider,
9069             prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
9070
9071         (void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
9072         (void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
9073         (void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
9074 }
9075
9076 /*
9077  * Called to indicate that a probe -- or probes -- should be provided by a
9078  * specfied provider.  If the specified description is NULL, the provider will
9079  * be told to provide all of its probes.  (This is done whenever a new
9080  * consumer comes along, or whenever a retained enabling is to be matched.) If
9081  * the specified description is non-NULL, the provider is given the
9082  * opportunity to dynamically provide the specified probe, allowing providers
9083  * to support the creation of probes on-the-fly.  (So-called _autocreated_
9084  * probes.)  If the provider is NULL, the operations will be applied to all
9085  * providers; if the provider is non-NULL the operations will only be applied
9086  * to the specified provider.  The dtrace_provider_lock must be held, and the
9087  * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9088  * will need to grab the dtrace_lock when it reenters the framework through
9089  * dtrace_probe_lookup(), dtrace_probe_create(), etc.
9090  */
9091 static void
9092 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
9093 {
9094         struct modctl *ctl;
9095         int all = 0;
9096
9097         LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
9098
9099         if (prv == NULL) {
9100                 all = 1;
9101                 prv = dtrace_provider;
9102         }
9103
9104         do {
9105                 /*
9106                  * First, call the blanket provide operation.
9107                  */
9108                 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9109
9110                 /*
9111                  * Now call the per-module provide operation.  We will grab
9112                  * mod_lock to prevent the list from being modified.  Note
9113                  * that this also prevents the mod_busy bits from changing.
9114                  * (mod_busy can only be changed with mod_lock held.)
9115                  */
9116                 lck_mtx_lock(&mod_lock);
9117
9118                 ctl = dtrace_modctl_list;
9119                 while (ctl) {
9120                         prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9121                         ctl = ctl->mod_next;
9122                 }
9123
9124                 lck_mtx_unlock(&mod_lock);
9125         } while (all && (prv = prv->dtpv_next) != NULL);
9126 }
9127
9128 /*
9129  * Iterate over each probe, and call the Framework-to-Provider API function
9130  * denoted by offs.
9131  */
9132 static void
9133 dtrace_probe_foreach(uintptr_t offs)
9134 {
9135         dtrace_provider_t *prov;
9136         void (*func)(void *, dtrace_id_t, void *);
9137         dtrace_probe_t *probe;
9138         dtrace_icookie_t cookie;
9139         int i;
9140
9141         /*
9142          * We disable interrupts to walk through the probe array.  This is
9143          * safe -- the dtrace_sync() in dtrace_unregister() assures that we
9144          * won't see stale data.
9145          */
9146         cookie = dtrace_interrupt_disable();
9147
9148         for (i = 0; i < dtrace_nprobes; i++) {
9149                 if ((probe = dtrace_probes[i]) == NULL)
9150                         continue;
9151
9152                 if (probe->dtpr_ecb == NULL) {
9153                         /*
9154                          * This probe isn't enabled -- don't call the function.
9155                          */
9156                         continue;
9157                 }
9158
9159                 prov = probe->dtpr_provider;
9160                 func = *((void(**)(void *, dtrace_id_t, void *))
9161                     ((uintptr_t)&prov->dtpv_pops + offs));
9162
9163                 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
9164         }
9165
9166         dtrace_interrupt_enable(cookie);
9167 }
9168
9169 static int
9170 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtrace_ecbdesc_t *ep)
9171 {
9172         dtrace_probekey_t pkey;
9173         uint32_t priv;
9174         uid_t uid;
9175         zoneid_t zoneid;
9176         int err;
9177
9178         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9179
9180         dtrace_ecb_create_cache = NULL;
9181
9182         if (desc == NULL) {
9183                 /*
9184                  * If we're passed a NULL description, we're being asked to
9185                  * create an ECB with a NULL probe.
9186                  */
9187                 (void) dtrace_ecb_create_enable(NULL, enab, ep);
9188                 return (0);
9189         }
9190
9191         dtrace_probekey(desc, &pkey);
9192         dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9193             &priv, &uid, &zoneid);
9194
9195         err = dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab, ep);
9196
9197         dtrace_probekey_release(&pkey);
9198
9199         return err;
9200 }
9201
9202 /*
9203  * DTrace Helper Provider Functions
9204  */
9205 static void
9206 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9207 {
9208         attr->dtat_name = DOF_ATTR_NAME(dofattr);
9209         attr->dtat_data = DOF_ATTR_DATA(dofattr);
9210         attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9211 }
9212
9213 static void
9214 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9215     const dof_provider_t *dofprov, char *strtab)
9216 {
9217         hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9218         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9219             dofprov->dofpv_provattr);
9220         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9221             dofprov->dofpv_modattr);
9222         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9223             dofprov->dofpv_funcattr);
9224         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9225             dofprov->dofpv_nameattr);
9226         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9227             dofprov->dofpv_argsattr);
9228 }
9229
9230 static void
9231 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
9232 {
9233         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9234         dof_hdr_t *dof = (dof_hdr_t *)daddr;
9235         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9236         dof_provider_t *provider;
9237         dof_probe_t *probe;
9238         uint32_t *off, *enoff;
9239         uint8_t *arg;
9240         char *strtab;
9241         uint_t i, nprobes;
9242         dtrace_helper_provdesc_t dhpv;
9243         dtrace_helper_probedesc_t dhpb;
9244         dtrace_meta_t *meta = dtrace_meta_pid;
9245         dtrace_mops_t *mops = &meta->dtm_mops;
9246         void *parg;
9247
9248         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9249         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9250             provider->dofpv_strtab * dof->dofh_secsize);
9251         prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9252             provider->dofpv_probes * dof->dofh_secsize);
9253         arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9254             provider->dofpv_prargs * dof->dofh_secsize);
9255         off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9256             provider->dofpv_proffs * dof->dofh_secsize);
9257
9258         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9259         off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9260         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9261         enoff = NULL;
9262
9263         /*
9264          * See dtrace_helper_provider_validate().
9265          */
9266         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9267             provider->dofpv_prenoffs != DOF_SECT_NONE) {
9268                 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9269                     provider->dofpv_prenoffs * dof->dofh_secsize);
9270                 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9271         }
9272
9273         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9274
9275         /*
9276          * Create the provider.
9277          */
9278         dtrace_dofprov2hprov(&dhpv, provider, strtab);
9279
9280         if ((parg = mops->dtms_provide_proc(meta->dtm_arg, &dhpv, p)) == NULL)
9281                 return;
9282
9283         meta->dtm_count++;
9284
9285         /*
9286          * Create the probes.
9287          */
9288         for (i = 0; i < nprobes; i++) {
9289                 probe = (dof_probe_t *)(uintptr_t)(daddr +
9290                     prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9291
9292                 dhpb.dthpb_mod = dhp->dofhp_mod;
9293                 dhpb.dthpb_func = strtab + probe->dofpr_func;
9294                 dhpb.dthpb_name = strtab + probe->dofpr_name;
9295 #if !defined(__APPLE__)
9296                 dhpb.dthpb_base = probe->dofpr_addr;
9297 #else
9298                 dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
9299 #endif
9300                 dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
9301                 dhpb.dthpb_noffs = probe->dofpr_noffs;
9302                 if (enoff != NULL) {
9303                         dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
9304                         dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9305                 } else {
9306                         dhpb.dthpb_enoffs = NULL;
9307                         dhpb.dthpb_nenoffs = 0;
9308                 }
9309                 dhpb.dthpb_args = arg + probe->dofpr_argidx;
9310                 dhpb.dthpb_nargc = probe->dofpr_nargc;
9311                 dhpb.dthpb_xargc = probe->dofpr_xargc;
9312                 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9313                 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9314
9315                 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9316         }
9317
9318         /*
9319          * Since we just created probes, we need to match our enablings
9320          * against those, with a precondition knowing that we have only
9321          * added probes from this provider
9322          */
9323         char *prov_name = mops->dtms_provider_name(parg);
9324         ASSERT(prov_name != NULL);
9325         dtrace_match_cond_t cond = {dtrace_cond_provider_match, (void*)prov_name};
9326
9327         dtrace_enabling_matchall_with_cond(&cond);
9328 }
9329
9330 static void
9331 dtrace_helper_provide(dof_helper_t *dhp, proc_t *p)
9332 {
9333         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9334         dof_hdr_t *dof = (dof_hdr_t *)daddr;
9335         uint32_t i;
9336
9337         LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9338
9339         for (i = 0; i < dof->dofh_secnum; i++) {
9340                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9341                     dof->dofh_secoff + i * dof->dofh_secsize);
9342
9343                 if (sec->dofs_type != DOF_SECT_PROVIDER)
9344                         continue;
9345
9346                 dtrace_helper_provide_one(dhp, sec, p);
9347         }
9348 }
9349
9350 static void
9351 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
9352 {
9353         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9354         dof_hdr_t *dof = (dof_hdr_t *)daddr;
9355         dof_sec_t *str_sec;
9356         dof_provider_t *provider;
9357         char *strtab;
9358         dtrace_helper_provdesc_t dhpv;
9359         dtrace_meta_t *meta = dtrace_meta_pid;
9360         dtrace_mops_t *mops = &meta->dtm_mops;
9361
9362         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9363         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9364             provider->dofpv_strtab * dof->dofh_secsize);
9365
9366         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9367
9368         /*
9369          * Create the provider.
9370          */
9371         dtrace_dofprov2hprov(&dhpv, provider, strtab);
9372
9373         mops->dtms_remove_proc(meta->dtm_arg, &dhpv, p);
9374
9375         meta->dtm_count--;
9376 }
9377
9378 static void
9379 dtrace_helper_provider_remove(dof_helper_t *dhp, proc_t *p)
9380 {
9381         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9382         dof_hdr_t *dof = (dof_hdr_t *)daddr;
9383         uint32_t i;
9384
9385         LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9386
9387         for (i = 0; i < dof->dofh_secnum; i++) {
9388                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9389                     dof->dofh_secoff + i * dof->dofh_secsize);
9390
9391                 if (sec->dofs_type != DOF_SECT_PROVIDER)
9392                         continue;
9393
9394                 dtrace_helper_provider_remove_one(dhp, sec, p);
9395         }
9396 }
9397
9398 /*
9399  * DTrace Meta Provider-to-Framework API Functions
9400  *
9401  * These functions implement the Meta Provider-to-Framework API, as described
9402  * in <sys/dtrace.h>.
9403  */
9404 int
9405 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9406     dtrace_meta_provider_id_t *idp)
9407 {
9408         dtrace_meta_t *meta;
9409         dtrace_helpers_t *help, *next;
9410         uint_t i;
9411
9412         *idp = DTRACE_METAPROVNONE;
9413
9414         /*
9415          * We strictly don't need the name, but we hold onto it for
9416          * debuggability. All hail error queues!
9417          */
9418         if (name == NULL) {
9419                 cmn_err(CE_WARN, "failed to register meta-provider: "
9420                     "invalid name");
9421                 return (EINVAL);
9422         }
9423
9424         if (mops == NULL ||
9425             mops->dtms_create_probe == NULL ||
9426             mops->dtms_provide_proc == NULL ||
9427             mops->dtms_remove_proc == NULL) {
9428                 cmn_err(CE_WARN, "failed to register meta-register %s: "
9429                     "invalid ops", name);
9430                 return (EINVAL);
9431         }
9432
9433         meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9434         meta->dtm_mops = *mops;
9435         meta->dtm_arg = arg;
9436
9437         lck_mtx_lock(&dtrace_meta_lock);
9438         lck_mtx_lock(&dtrace_lock);
9439
9440         if (dtrace_meta_pid != NULL) {
9441                 lck_mtx_unlock(&dtrace_lock);
9442                 lck_mtx_unlock(&dtrace_meta_lock);
9443                 cmn_err(CE_WARN, "failed to register meta-register %s: "
9444                     "user-land meta-provider exists", name);
9445                 kmem_free(meta, sizeof (dtrace_meta_t));
9446                 return (EINVAL);
9447         }
9448
9449         meta->dtm_name = dtrace_strref(name);
9450
9451         dtrace_meta_pid = meta;
9452         *idp = (dtrace_meta_provider_id_t)meta;
9453
9454         /*
9455          * If there are providers and probes ready to go, pass them
9456          * off to the new meta provider now.
9457          */
9458
9459         help = dtrace_deferred_pid;
9460         dtrace_deferred_pid = NULL;
9461
9462         lck_mtx_unlock(&dtrace_lock);
9463
9464         while (help != NULL) {
9465                 for (i = 0; i < help->dthps_nprovs; i++) {
9466                         proc_t *p = proc_find(help->dthps_pid);
9467                         if (p == PROC_NULL)
9468                                 continue;
9469                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9470                             p);
9471                         proc_rele(p);
9472                 }
9473
9474                 next = help->dthps_next;
9475                 help->dthps_next = NULL;
9476                 help->dthps_prev = NULL;
9477                 help->dthps_deferred = 0;
9478                 help = next;
9479         }
9480
9481         lck_mtx_unlock(&dtrace_meta_lock);
9482
9483         return (0);
9484 }
9485
9486 int
9487 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9488 {
9489         dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9490
9491         lck_mtx_lock(&dtrace_meta_lock);
9492         lck_mtx_lock(&dtrace_lock);
9493
9494         if (old == dtrace_meta_pid) {
9495                 pp = &dtrace_meta_pid;
9496         } else {
9497                 panic("attempt to unregister non-existent "
9498                     "dtrace meta-provider %p\n", (void *)old);
9499         }
9500
9501         if (old->dtm_count != 0) {
9502                 lck_mtx_unlock(&dtrace_lock);
9503                 lck_mtx_unlock(&dtrace_meta_lock);
9504                 return (EBUSY);
9505         }
9506
9507         *pp = NULL;
9508
9509         dtrace_strunref(old->dtm_name);
9510
9511         lck_mtx_unlock(&dtrace_lock);
9512         lck_mtx_unlock(&dtrace_meta_lock);
9513
9514         kmem_free(old, sizeof (dtrace_meta_t));
9515
9516         return (0);
9517 }
9518
9519
9520 /*
9521  * DTrace DIF Object Functions
9522  */
9523 static int
9524 dtrace_difo_err(uint_t pc, const char *format, ...)
9525 {
9526         if (dtrace_err_verbose) {
9527                 va_list alist;
9528
9529                 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
9530                 va_start(alist, format);
9531                 (void) vuprintf(format, alist);
9532                 va_end(alist);
9533         }
9534
9535 #ifdef DTRACE_ERRDEBUG
9536         dtrace_errdebug(format);
9537 #endif
9538         return (1);
9539 }
9540
9541 /*
9542  * Validate a DTrace DIF object by checking the IR instructions.  The following
9543  * rules are currently enforced by dtrace_difo_validate():
9544  *
9545  * 1. Each instruction must have a valid opcode
9546  * 2. Each register, string, variable, or subroutine reference must be valid
9547  * 3. No instruction can modify register %r0 (must be zero)
9548  * 4. All instruction reserved bits must be set to zero
9549  * 5. The last instruction must be a "ret" instruction
9550  * 6. All branch targets must reference a valid instruction _after_ the branch
9551  */
9552 static int
9553 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9554     cred_t *cr)
9555 {
9556         int err = 0;
9557         uint_t i;
9558
9559         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9560         int kcheckload;
9561         uint_t pc;
9562         int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
9563
9564         kcheckload = cr == NULL ||
9565             (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9566
9567         dp->dtdo_destructive = 0;
9568
9569         for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9570                 dif_instr_t instr = dp->dtdo_buf[pc];
9571
9572                 uint_t r1 = DIF_INSTR_R1(instr);
9573                 uint_t r2 = DIF_INSTR_R2(instr);
9574                 uint_t rd = DIF_INSTR_RD(instr);
9575                 uint_t rs = DIF_INSTR_RS(instr);
9576                 uint_t label = DIF_INSTR_LABEL(instr);
9577                 uint_t v = DIF_INSTR_VAR(instr);
9578                 uint_t subr = DIF_INSTR_SUBR(instr);
9579                 uint_t type = DIF_INSTR_TYPE(instr);
9580                 uint_t op = DIF_INSTR_OP(instr);
9581
9582                 switch (op) {
9583                 case DIF_OP_OR:
9584                 case DIF_OP_XOR:
9585                 case DIF_OP_AND:
9586                 case DIF_OP_SLL:
9587                 case DIF_OP_SRL:
9588                 case DIF_OP_SRA:
9589                 case DIF_OP_SUB:
9590                 case DIF_OP_ADD:
9591                 case DIF_OP_MUL:
9592                 case DIF_OP_SDIV:
9593                 case DIF_OP_UDIV:
9594                 case DIF_OP_SREM:
9595                 case DIF_OP_UREM:
9596                 case DIF_OP_COPYS:
9597                         if (r1 >= nregs)
9598                                 err += efunc(pc, "invalid register %u\n", r1);
9599                         if (r2 >= nregs)
9600                                 err += efunc(pc, "invalid register %u\n", r2);
9601                         if (rd >= nregs)
9602                                 err += efunc(pc, "invalid register %u\n", rd);
9603                         if (rd == 0)
9604                                 err += efunc(pc, "cannot write to %%r0\n");
9605                         break;
9606                 case DIF_OP_NOT:
9607                 case DIF_OP_MOV:
9608                 case DIF_OP_ALLOCS:
9609                         if (r1 >= nregs)
9610                                 err += efunc(pc, "invalid register %u\n", r1);
9611                         if (r2 != 0)
9612                                 err += efunc(pc, "non-zero reserved bits\n");
9613                         if (rd >= nregs)
9614                                 err += efunc(pc, "invalid register %u\n", rd);
9615                         if (rd == 0)
9616                                 err += efunc(pc, "cannot write to %%r0\n");
9617                         break;
9618                 case DIF_OP_LDSB:
9619                 case DIF_OP_LDSH:
9620                 case DIF_OP_LDSW:
9621                 case DIF_OP_LDUB:
9622                 case DIF_OP_LDUH:
9623                 case DIF_OP_LDUW:
9624                 case DIF_OP_LDX:
9625                         if (r1 >= nregs)
9626                                 err += efunc(pc, "invalid register %u\n", r1);
9627                         if (r2 != 0)
9628                                 err += efunc(pc, "non-zero reserved bits\n");
9629                         if (rd >= nregs)
9630                                 err += efunc(pc, "invalid register %u\n", rd);
9631                         if (rd == 0)
9632                                 err += efunc(pc, "cannot write to %%r0\n");
9633                         if (kcheckload)
9634                                 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9635                                     DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9636                         break;
9637                 case DIF_OP_RLDSB:
9638                 case DIF_OP_RLDSH:
9639                 case DIF_OP_RLDSW:
9640                 case DIF_OP_RLDUB:
9641                 case DIF_OP_RLDUH:
9642                 case DIF_OP_RLDUW:
9643                 case DIF_OP_RLDX:
9644                         if (r1 >= nregs)
9645                                 err += efunc(pc, "invalid register %u\n", r1);
9646                         if (r2 != 0)
9647                                 err += efunc(pc, "non-zero reserved bits\n");
9648                         if (rd >= nregs)
9649                                 err += efunc(pc, "invalid register %u\n", rd);
9650                         if (rd == 0)
9651                                 err += efunc(pc, "cannot write to %%r0\n");
9652                         break;
9653                 case DIF_OP_ULDSB:
9654                 case DIF_OP_ULDSH:
9655                 case DIF_OP_ULDSW:
9656                 case DIF_OP_ULDUB:
9657                 case DIF_OP_ULDUH:
9658                 case DIF_OP_ULDUW:
9659                 case DIF_OP_ULDX:
9660                         if (r1 >= nregs)
9661                                 err += efunc(pc, "invalid register %u\n", r1);
9662                         if (r2 != 0)
9663                                 err += efunc(pc, "non-zero reserved bits\n");
9664                         if (rd >= nregs)
9665                                 err += efunc(pc, "invalid register %u\n", rd);
9666                         if (rd == 0)
9667                                 err += efunc(pc, "cannot write to %%r0\n");
9668                         break;
9669                 case DIF_OP_STB:
9670                 case DIF_OP_STH:
9671                 case DIF_OP_STW:
9672                 case DIF_OP_STX:
9673                         if (r1 >= nregs)
9674                                 err += efunc(pc, "invalid register %u\n", r1);
9675                         if (r2 != 0)
9676                                 err += efunc(pc, "non-zero reserved bits\n");
9677                         if (rd >= nregs)
9678                                 err += efunc(pc, "invalid register %u\n", rd);
9679                         if (rd == 0)
9680                                 err += efunc(pc, "cannot write to 0 address\n");
9681                         break;
9682                 case DIF_OP_CMP:
9683                 case DIF_OP_SCMP:
9684                         if (r1 >= nregs)
9685                                 err += efunc(pc, "invalid register %u\n", r1);
9686                         if (r2 >= nregs)
9687                                 err += efunc(pc, "invalid register %u\n", r2);
9688                         if (rd != 0)
9689                                 err += efunc(pc, "non-zero reserved bits\n");
9690                         break;
9691                 case DIF_OP_TST:
9692                         if (r1 >= nregs)
9693                                 err += efunc(pc, "invalid register %u\n", r1);
9694                         if (r2 != 0 || rd != 0)
9695                                 err += efunc(pc, "non-zero reserved bits\n");
9696                         break;
9697                 case DIF_OP_BA:
9698                 case DIF_OP_BE:
9699                 case DIF_OP_BNE:
9700                 case DIF_OP_BG:
9701                 case DIF_OP_BGU:
9702                 case DIF_OP_BGE:
9703                 case DIF_OP_BGEU:
9704                 case DIF_OP_BL:
9705                 case DIF_OP_BLU:
9706                 case DIF_OP_BLE:
9707                 case DIF_OP_BLEU:
9708                         if (label >= dp->dtdo_len) {
9709                                 err += efunc(pc, "invalid branch target %u\n",
9710                                     label);
9711                         }
9712                         if (label <= pc) {
9713                                 err += efunc(pc, "backward branch to %u\n",
9714                                     label);
9715                         }
9716                         break;
9717                 case DIF_OP_RET:
9718                         if (r1 != 0 || r2 != 0)
9719                                 err += efunc(pc, "non-zero reserved bits\n");
9720                         if (rd >= nregs)
9721                                 err += efunc(pc, "invalid register %u\n", rd);
9722                         break;
9723                 case DIF_OP_NOP:
9724                 case DIF_OP_POPTS:
9725                 case DIF_OP_FLUSHTS:
9726                         if (r1 != 0 || r2 != 0 || rd != 0)
9727                                 err += efunc(pc, "non-zero reserved bits\n");
9728                         break;
9729                 case DIF_OP_SETX:
9730                         if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9731                                 err += efunc(pc, "invalid integer ref %u\n",
9732                                     DIF_INSTR_INTEGER(instr));
9733                         }
9734                         if (rd >= nregs)
9735                                 err += efunc(pc, "invalid register %u\n", rd);
9736                         if (rd == 0)
9737                                 err += efunc(pc, "cannot write to %%r0\n");
9738                         break;
9739                 case DIF_OP_SETS:
9740                         if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9741                                 err += efunc(pc, "invalid string ref %u\n",
9742                                     DIF_INSTR_STRING(instr));
9743                         }
9744                         if (rd >= nregs)
9745                                 err += efunc(pc, "invalid register %u\n", rd);
9746                         if (rd == 0)
9747                                 err += efunc(pc, "cannot write to %%r0\n");
9748                         break;
9749                 case DIF_OP_LDGA:
9750                 case DIF_OP_LDTA:
9751                         if (r1 > DIF_VAR_ARRAY_MAX)
9752                                 err += efunc(pc, "invalid array %u\n", r1);
9753                         if (r2 >= nregs)
9754                                 err += efunc(pc, "invalid register %u\n", r2);
9755                         if (rd >= nregs)
9756                                 err += efunc(pc, "invalid register %u\n", rd);
9757                         if (rd == 0)
9758                                 err += efunc(pc, "cannot write to %%r0\n");
9759                         break;
9760                 case DIF_OP_LDGS:
9761                 case DIF_OP_LDTS:
9762                 case DIF_OP_LDLS:
9763                 case DIF_OP_LDGAA:
9764                 case DIF_OP_LDTAA:
9765                         if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9766                                 err += efunc(pc, "invalid variable %u\n", v);
9767                         if (rd >= nregs)
9768                                 err += efunc(pc, "invalid register %u\n", rd);
9769                         if (rd == 0)
9770                                 err += efunc(pc, "cannot write to %%r0\n");
9771                         break;
9772                 case DIF_OP_STGS:
9773                 case DIF_OP_STTS:
9774                 case DIF_OP_STLS:
9775                 case DIF_OP_STGAA:
9776                 case DIF_OP_STTAA:
9777                         if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9778                                 err += efunc(pc, "invalid variable %u\n", v);
9779                         if (rs >= nregs)
9780                                 err += efunc(pc, "invalid register %u\n", rd);
9781                         break;
9782                 case DIF_OP_CALL:
9783                         if (subr > DIF_SUBR_MAX &&
9784                            !(subr >= DIF_SUBR_APPLE_MIN && subr <= DIF_SUBR_APPLE_MAX))
9785                                 err += efunc(pc, "invalid subr %u\n", subr);
9786                         if (rd >= nregs)
9787                                 err += efunc(pc, "invalid register %u\n", rd);
9788                         if (rd == 0)
9789                                 err += efunc(pc, "cannot write to %%r0\n");
9790
9791                         if (subr == DIF_SUBR_COPYOUT ||
9792                             subr == DIF_SUBR_COPYOUTSTR ||
9793                             subr == DIF_SUBR_KDEBUG_TRACE ||
9794                             subr == DIF_SUBR_KDEBUG_TRACE_STRING) {
9795                                 dp->dtdo_destructive = 1;
9796                         }
9797                         break;
9798                 case DIF_OP_PUSHTR:
9799                         if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9800                                 err += efunc(pc, "invalid ref type %u\n", type);
9801                         if (r2 >= nregs)
9802                                 err += efunc(pc, "invalid register %u\n", r2);
9803                         if (rs >= nregs)
9804                                 err += efunc(pc, "invalid register %u\n", rs);
9805                         break;
9806                 case DIF_OP_PUSHTV:
9807                         if (type != DIF_TYPE_CTF)
9808                                 err += efunc(pc, "invalid val type %u\n", type);
9809                         if (r2 >= nregs)
9810                                 err += efunc(pc, "invalid register %u\n", r2);
9811                         if (rs >= nregs)
9812                                 err += efunc(pc, "invalid register %u\n", rs);
9813                         break;
9814                 case DIF_OP_STRIP:
9815                         if (r1 >= nregs)
9816                                 err += efunc(pc, "invalid register %u\n", r1);
9817                         if (!dtrace_is_valid_ptrauth_key(r2))
9818                                 err += efunc(pc, "invalid key\n");
9819                         if (rd >= nregs)
9820                                 err += efunc(pc, "invalid register %u\n", rd);
9821                         if (rd == 0)
9822                                 err += efunc(pc, "cannot write to %%r0\n");
9823                         break;
9824                 default:
9825                         err += efunc(pc, "invalid opcode %u\n",
9826                             DIF_INSTR_OP(instr));
9827                 }
9828         }
9829
9830         if (dp->dtdo_len != 0 &&
9831             DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9832                 err += efunc(dp->dtdo_len - 1,
9833                     "expected 'ret' as last DIF instruction\n");
9834         }
9835
9836         if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9837                 /*
9838                  * If we're not returning by reference, the size must be either
9839                  * 0 or the size of one of the base types.
9840                  */
9841                 switch (dp->dtdo_rtype.dtdt_size) {
9842                 case 0:
9843                 case sizeof (uint8_t):
9844                 case sizeof (uint16_t):
9845                 case sizeof (uint32_t):
9846                 case sizeof (uint64_t):
9847                         break;
9848
9849                 default:
9850                         err += efunc(dp->dtdo_len - 1, "bad return size\n");
9851                 }
9852         }
9853
9854         for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9855                 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9856                 dtrace_diftype_t *vt, *et;
9857                 uint_t id;
9858                 int ndx;
9859
9860                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9861                     v->dtdv_scope != DIFV_SCOPE_THREAD &&
9862                     v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9863                         err += efunc(i, "unrecognized variable scope %d\n",
9864                             v->dtdv_scope);
9865                         break;
9866                 }
9867
9868                 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9869                     v->dtdv_kind != DIFV_KIND_SCALAR) {
9870                         err += efunc(i, "unrecognized variable type %d\n",
9871                             v->dtdv_kind);
9872                         break;
9873                 }
9874
9875                 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9876                         err += efunc(i, "%d exceeds variable id limit\n", id);
9877                         break;
9878                 }
9879
9880                 if (id < DIF_VAR_OTHER_UBASE)
9881                         continue;
9882
9883                 /*
9884                  * For user-defined variables, we need to check that this
9885                  * definition is identical to any previous definition that we
9886                  * encountered.
9887                  */
9888                 ndx = id - DIF_VAR_OTHER_UBASE;
9889
9890                 switch (v->dtdv_scope) {
9891                 case DIFV_SCOPE_GLOBAL:
9892                         if (maxglobal == -1 || ndx > maxglobal)
9893                                 maxglobal = ndx;
9894
9895                         if (ndx < vstate->dtvs_nglobals) {
9896                                 dtrace_statvar_t *svar;
9897
9898                                 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9899                                         existing = &svar->dtsv_var;
9900                         }
9901
9902                         break;
9903
9904                 case DIFV_SCOPE_THREAD:
9905                         if (maxtlocal == -1 || ndx > maxtlocal)
9906                                 maxtlocal = ndx;
9907
9908                         if (ndx < vstate->dtvs_ntlocals)
9909                                 existing = &vstate->dtvs_tlocals[ndx];
9910                         break;
9911
9912                 case DIFV_SCOPE_LOCAL:
9913                         if (maxlocal == -1 || ndx > maxlocal)
9914                                 maxlocal = ndx;
9915                         if (ndx < vstate->dtvs_nlocals) {
9916                                 dtrace_statvar_t *svar;
9917
9918                                 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9919                                         existing = &svar->dtsv_var;
9920                         }
9921
9922                         break;
9923                 }
9924
9925                 vt = &v->dtdv_type;
9926
9927                 if (vt->dtdt_flags & DIF_TF_BYREF) {
9928                         if (vt->dtdt_size == 0) {
9929                                 err += efunc(i, "zero-sized variable\n");
9930                                 break;
9931                         }
9932
9933                         if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
9934                             v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
9935                             vt->dtdt_size > dtrace_statvar_maxsize) {
9936                                 err += efunc(i, "oversized by-ref static\n");
9937                                 break;
9938                         }
9939                 }
9940
9941                 if (existing == NULL || existing->dtdv_id == 0)
9942                         continue;
9943
9944                 ASSERT(existing->dtdv_id == v->dtdv_id);
9945                 ASSERT(existing->dtdv_scope == v->dtdv_scope);
9946
9947                 if (existing->dtdv_kind != v->dtdv_kind)
9948                         err += efunc(i, "%d changed variable kind\n", id);
9949
9950                 et = &existing->dtdv_type;
9951
9952                 if (vt->dtdt_flags != et->dtdt_flags) {
9953                         err += efunc(i, "%d changed variable type flags\n", id);
9954                         break;
9955                 }
9956
9957                 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9958                         err += efunc(i, "%d changed variable type size\n", id);
9959                         break;
9960                 }
9961         }
9962
9963         for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9964                 dif_instr_t instr = dp->dtdo_buf[pc];
9965
9966                 uint_t v = DIF_INSTR_VAR(instr);
9967                 uint_t op = DIF_INSTR_OP(instr);
9968
9969                 switch (op) {
9970                 case DIF_OP_LDGS:
9971                 case DIF_OP_LDGAA:
9972                 case DIF_OP_STGS:
9973                 case DIF_OP_STGAA:
9974                         if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxglobal))
9975                                 err += efunc(pc, "invalid variable %u\n", v);
9976                         break;
9977                 case DIF_OP_LDTS:
9978                 case DIF_OP_LDTAA:
9979                 case DIF_OP_STTS:
9980                 case DIF_OP_STTAA:
9981                         if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxtlocal))
9982                                 err += efunc(pc, "invalid variable %u\n", v);
9983                         break;
9984                 case DIF_OP_LDLS:
9985                 case DIF_OP_STLS:
9986                         if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxlocal))
9987                                 err += efunc(pc, "invalid variable %u\n", v);
9988                         break;
9989                 default:
9990                         break;
9991                 }
9992         }
9993
9994         return (err);
9995 }
9996
9997 /*
9998  * Validate a DTrace DIF object that it is to be used as a helper.  Helpers
9999  * are much more constrained than normal DIFOs.  Specifically, they may
10000  * not:
10001  *
10002  * 1. Make calls to subroutines other than copyin(), copyinstr() or
10003  *    miscellaneous string routines
10004  * 2. Access DTrace variables other than the args[] array, and the
10005  *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
10006  * 3. Have thread-local variables.
10007  * 4. Have dynamic variables.
10008  */
10009 static int
10010 dtrace_difo_validate_helper(dtrace_difo_t *dp)
10011 {
10012         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
10013         int err = 0;
10014         uint_t pc;
10015
10016         for (pc = 0; pc < dp->dtdo_len; pc++) {
10017                 dif_instr_t instr = dp->dtdo_buf[pc];
10018
10019                 uint_t v = DIF_INSTR_VAR(instr);
10020                 uint_t subr = DIF_INSTR_SUBR(instr);
10021                 uint_t op = DIF_INSTR_OP(instr);
10022
10023                 switch (op) {
10024                 case DIF_OP_OR:
10025                 case DIF_OP_XOR:
10026                 case DIF_OP_AND:
10027                 case DIF_OP_SLL:
10028                 case DIF_OP_SRL:
10029                 case DIF_OP_SRA:
10030                 case DIF_OP_SUB:
10031                 case DIF_OP_ADD:
10032                 case DIF_OP_MUL:
10033                 case DIF_OP_SDIV:
10034                 case DIF_OP_UDIV:
10035                 case DIF_OP_SREM:
10036                 case DIF_OP_UREM:
10037                 case DIF_OP_COPYS:
10038                 case DIF_OP_NOT:
10039                 case DIF_OP_MOV:
10040                 case DIF_OP_RLDSB:
10041                 case DIF_OP_RLDSH:
10042                 case DIF_OP_RLDSW:
10043                 case DIF_OP_RLDUB:
10044                 case DIF_OP_RLDUH:
10045                 case DIF_OP_RLDUW:
10046                 case DIF_OP_RLDX:
10047                 case DIF_OP_ULDSB:
10048                 case DIF_OP_ULDSH:
10049                 case DIF_OP_ULDSW:
10050                 case DIF_OP_ULDUB:
10051                 case DIF_OP_ULDUH:
10052                 case DIF_OP_ULDUW:
10053                 case DIF_OP_ULDX:
10054                 case DIF_OP_STB:
10055                 case DIF_OP_STH:
10056                 case DIF_OP_STW:
10057                 case DIF_OP_STX:
10058                 case DIF_OP_ALLOCS:
10059                 case DIF_OP_CMP:
10060                 case DIF_OP_SCMP:
10061                 case DIF_OP_TST:
10062                 case DIF_OP_BA:
10063                 case DIF_OP_BE:
10064                 case DIF_OP_BNE:
10065                 case DIF_OP_BG:
10066                 case DIF_OP_BGU:
10067                 case DIF_OP_BGE:
10068                 case DIF_OP_BGEU:
10069                 case DIF_OP_BL:
10070                 case DIF_OP_BLU:
10071                 case DIF_OP_BLE:
10072                 case DIF_OP_BLEU:
10073                 case DIF_OP_RET:
10074                 case DIF_OP_NOP:
10075                 case DIF_OP_POPTS:
10076                 case DIF_OP_FLUSHTS:
10077                 case DIF_OP_SETX:
10078                 case DIF_OP_SETS:
10079                 case DIF_OP_LDGA:
10080                 case DIF_OP_LDLS:
10081                 case DIF_OP_STGS:
10082                 case DIF_OP_STLS:
10083                 case DIF_OP_PUSHTR:
10084                 case DIF_OP_PUSHTV:
10085                         break;
10086
10087                 case DIF_OP_LDGS:
10088                         if (v >= DIF_VAR_OTHER_UBASE)
10089                                 break;
10090
10091                         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10092                                 break;
10093
10094                         if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
10095                             v == DIF_VAR_PPID || v == DIF_VAR_TID ||
10096                             v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
10097                             v == DIF_VAR_UID || v == DIF_VAR_GID)
10098                                 break;
10099
10100                         err += efunc(pc, "illegal variable %u\n", v);
10101                         break;
10102
10103                 case DIF_OP_LDTA:
10104                 case DIF_OP_LDTS:
10105                 case DIF_OP_LDGAA:
10106                 case DIF_OP_LDTAA:
10107                         err += efunc(pc, "illegal dynamic variable load\n");
10108                         break;
10109
10110                 case DIF_OP_STTS:
10111                 case DIF_OP_STGAA:
10112                 case DIF_OP_STTAA:
10113                         err += efunc(pc, "illegal dynamic variable store\n");
10114                         break;
10115
10116                 case DIF_OP_CALL:
10117                         if (subr == DIF_SUBR_ALLOCA ||
10118                             subr == DIF_SUBR_BCOPY ||
10119                             subr == DIF_SUBR_COPYIN ||
10120                             subr == DIF_SUBR_COPYINTO ||
10121                             subr == DIF_SUBR_COPYINSTR ||
10122                             subr == DIF_SUBR_INDEX ||
10123                             subr == DIF_SUBR_INET_NTOA ||
10124                             subr == DIF_SUBR_INET_NTOA6 ||
10125                             subr == DIF_SUBR_INET_NTOP ||
10126                             subr == DIF_SUBR_JSON ||
10127                             subr == DIF_SUBR_LLTOSTR ||
10128                             subr == DIF_SUBR_STRTOLL ||
10129                             subr == DIF_SUBR_RINDEX ||
10130                             subr == DIF_SUBR_STRCHR ||
10131                             subr == DIF_SUBR_STRJOIN ||
10132                             subr == DIF_SUBR_STRRCHR ||
10133                             subr == DIF_SUBR_STRSTR ||
10134                             subr == DIF_SUBR_KDEBUG_TRACE ||
10135                             subr == DIF_SUBR_KDEBUG_TRACE_STRING ||
10136                             subr == DIF_SUBR_HTONS ||
10137                             subr == DIF_SUBR_HTONL ||
10138                             subr == DIF_SUBR_HTONLL ||
10139                             subr == DIF_SUBR_NTOHS ||
10140                             subr == DIF_SUBR_NTOHL ||
10141                             subr == DIF_SUBR_NTOHLL)
10142                                 break;
10143
10144                         err += efunc(pc, "invalid subr %u\n", subr);
10145                         break;
10146
10147                 default:
10148                         err += efunc(pc, "invalid opcode %u\n",
10149                             DIF_INSTR_OP(instr));
10150                 }
10151         }
10152
10153         return (err);
10154 }
10155
10156 /*
10157  * Returns 1 if the expression in the DIF object can be cached on a per-thread
10158  * basis; 0 if not.
10159  */
10160 static int
10161 dtrace_difo_cacheable(dtrace_difo_t *dp)
10162 {
10163         uint_t i;
10164
10165         if (dp == NULL)
10166                 return (0);
10167
10168         for (i = 0; i < dp->dtdo_varlen; i++) {
10169                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10170
10171                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10172                         continue;
10173
10174                 switch (v->dtdv_id) {
10175                 case DIF_VAR_CURTHREAD:
10176                 case DIF_VAR_PID:
10177                 case DIF_VAR_TID:
10178                 case DIF_VAR_EXECNAME:
10179                 case DIF_VAR_ZONENAME:
10180                         break;
10181
10182                 default:
10183                         return (0);
10184                 }
10185         }
10186
10187         /*
10188          * This DIF object may be cacheable.  Now we need to look for any
10189          * array loading instructions, any memory loading instructions, or
10190          * any stores to thread-local variables.
10191          */
10192         for (i = 0; i < dp->dtdo_len; i++) {
10193                 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10194
10195                 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10196                     (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10197                     (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10198                     op == DIF_OP_LDGA || op == DIF_OP_STTS)
10199                         return (0);
10200         }
10201
10202         return (1);
10203 }
10204
10205 static void
10206 dtrace_difo_hold(dtrace_difo_t *dp)
10207 {
10208         uint_t i;
10209
10210         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10211
10212         dp->dtdo_refcnt++;
10213         ASSERT(dp->dtdo_refcnt != 0);
10214
10215         /*
10216          * We need to check this DIF object for references to the variable
10217          * DIF_VAR_VTIMESTAMP.
10218          */
10219         for (i = 0; i < dp->dtdo_varlen; i++) {
10220                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10221
10222                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10223                         continue;
10224
10225                 if (dtrace_vtime_references++ == 0)
10226                         dtrace_vtime_enable();
10227         }
10228 }
10229
10230 /*
10231  * This routine calculates the dynamic variable chunksize for a given DIF
10232  * object.  The calculation is not fool-proof, and can probably be tricked by
10233  * malicious DIF -- but it works for all compiler-generated DIF.  Because this
10234  * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10235  * if a dynamic variable size exceeds the chunksize.
10236  */
10237 static void
10238 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10239 {
10240         uint64_t sval = 0;
10241         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10242         const dif_instr_t *text = dp->dtdo_buf;
10243         uint_t pc, srd = 0;
10244         uint_t ttop = 0;
10245         size_t size, ksize;
10246         uint_t id, i;
10247
10248         for (pc = 0; pc < dp->dtdo_len; pc++) {
10249                 dif_instr_t instr = text[pc];
10250                 uint_t op = DIF_INSTR_OP(instr);
10251                 uint_t rd = DIF_INSTR_RD(instr);
10252                 uint_t r1 = DIF_INSTR_R1(instr);
10253                 uint_t nkeys = 0;
10254                 uchar_t scope;
10255
10256                 dtrace_key_t *key = tupregs;
10257
10258                 switch (op) {
10259                 case DIF_OP_SETX:
10260                         sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10261                         srd = rd;
10262                         continue;
10263
10264                 case DIF_OP_STTS:
10265                         key = &tupregs[DIF_DTR_NREGS];
10266                         key[0].dttk_size = 0;
10267                         key[1].dttk_size = 0;
10268                         nkeys = 2;
10269                         scope = DIFV_SCOPE_THREAD;
10270                         break;
10271
10272                 case DIF_OP_STGAA:
10273                 case DIF_OP_STTAA:
10274                         nkeys = ttop;
10275
10276                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
10277                                 key[nkeys++].dttk_size = 0;
10278
10279                         key[nkeys++].dttk_size = 0;
10280
10281                         if (op == DIF_OP_STTAA) {
10282                                 scope = DIFV_SCOPE_THREAD;
10283                         } else {
10284                                 scope = DIFV_SCOPE_GLOBAL;
10285                         }
10286
10287                         break;
10288
10289                 case DIF_OP_PUSHTR:
10290                         if (ttop == DIF_DTR_NREGS)
10291                                 return;
10292
10293                         if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10294                                 /*
10295                                  * If the register for the size of the "pushtr"
10296                                  * is %r0 (or the value is 0) and the type is
10297                                  * a string, we'll use the system-wide default
10298                                  * string size.
10299                                  */
10300                                 tupregs[ttop++].dttk_size =
10301                                     dtrace_strsize_default;
10302                         } else {
10303                                 if (srd == 0)
10304                                         return;
10305
10306                                 if (sval > LONG_MAX)
10307                                         return;
10308
10309                                 tupregs[ttop++].dttk_size = sval;
10310                         }
10311
10312                         break;
10313
10314                 case DIF_OP_PUSHTV:
10315                         if (ttop == DIF_DTR_NREGS)
10316                                 return;
10317
10318                         tupregs[ttop++].dttk_size = 0;
10319                         break;
10320
10321                 case DIF_OP_FLUSHTS:
10322                         ttop = 0;
10323                         break;
10324
10325                 case DIF_OP_POPTS:
10326                         if (ttop != 0)
10327                                 ttop--;
10328                         break;
10329                 }
10330
10331                 sval = 0;
10332                 srd = 0;
10333
10334                 if (nkeys == 0)
10335                         continue;
10336
10337                 /*
10338                  * We have a dynamic variable allocation; calculate its size.
10339                  */
10340                 for (ksize = 0, i = 0; i < nkeys; i++)
10341                         ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10342
10343                 size = sizeof (dtrace_dynvar_t);
10344                 size += sizeof (dtrace_key_t) * (nkeys - 1);
10345                 size += ksize;
10346
10347                 /*
10348                  * Now we need to determine the size of the stored data.
10349                  */
10350                 id = DIF_INSTR_VAR(instr);
10351
10352                 for (i = 0; i < dp->dtdo_varlen; i++) {
10353                         dtrace_difv_t *v = &dp->dtdo_vartab[i];
10354
10355                         if (v->dtdv_id == id && v->dtdv_scope == scope) {
10356                                 size += v->dtdv_type.dtdt_size;
10357                                 break;
10358                         }
10359                 }
10360
10361                 if (i == dp->dtdo_varlen)
10362                         return;
10363
10364                 /*
10365                  * We have the size.  If this is larger than the chunk size
10366                  * for our dynamic variable state, reset the chunk size.
10367                  */
10368                 size = P2ROUNDUP(size, sizeof (uint64_t));
10369
10370                 /*
10371                  * Before setting the chunk size, check that we're not going
10372                  * to set it to a negative value...
10373                  */
10374                 if (size > LONG_MAX)
10375                         return;
10376
10377                 /*
10378                  * ...and make certain that we didn't badly overflow.
10379                  */
10380                 if (size < ksize || size < sizeof (dtrace_dynvar_t))
10381                         return;
10382
10383                 if (size > vstate->dtvs_dynvars.dtds_chunksize)
10384                         vstate->dtvs_dynvars.dtds_chunksize = size;
10385         }
10386 }
10387
10388 static void
10389 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10390 {
10391         int oldsvars, osz, nsz, otlocals, ntlocals;
10392         uint_t i, id;
10393
10394         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10395         ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10396
10397         for (i = 0; i < dp->dtdo_varlen; i++) {
10398                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10399                 dtrace_statvar_t *svar;
10400                 dtrace_statvar_t ***svarp = NULL;
10401                 size_t dsize = 0;
10402                 uint8_t scope = v->dtdv_scope;
10403                 int *np = (int *)NULL;
10404
10405                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10406                         continue;
10407
10408                 id -= DIF_VAR_OTHER_UBASE;
10409
10410                 switch (scope) {
10411                 case DIFV_SCOPE_THREAD:
10412                         while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
10413                                 dtrace_difv_t *tlocals;
10414
10415                                 if ((ntlocals = (otlocals << 1)) == 0)
10416                                         ntlocals = 1;
10417
10418                                 osz = otlocals * sizeof (dtrace_difv_t);
10419                                 nsz = ntlocals * sizeof (dtrace_difv_t);
10420
10421                                 tlocals = kmem_zalloc(nsz, KM_SLEEP);
10422
10423                                 if (osz != 0) {
10424                                         bcopy(vstate->dtvs_tlocals,
10425                                             tlocals, osz);
10426                                         kmem_free(vstate->dtvs_tlocals, osz);
10427                                 }
10428
10429                                 vstate->dtvs_tlocals = tlocals;
10430                                 vstate->dtvs_ntlocals = ntlocals;
10431                         }
10432
10433                         vstate->dtvs_tlocals[id] = *v;
10434                         continue;
10435
10436                 case DIFV_SCOPE_LOCAL:
10437                         np = &vstate->dtvs_nlocals;
10438                         svarp = &vstate->dtvs_locals;
10439
10440                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10441                                 dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
10442                                     sizeof (uint64_t));
10443                         else
10444                                 dsize = (int)NCPU * sizeof (uint64_t);
10445
10446                         break;
10447
10448                 case DIFV_SCOPE_GLOBAL:
10449                         np = &vstate->dtvs_nglobals;
10450                         svarp = &vstate->dtvs_globals;
10451
10452                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10453                                 dsize = v->dtdv_type.dtdt_size +
10454                                     sizeof (uint64_t);
10455
10456                         break;
10457
10458                 default:
10459                         ASSERT(0);
10460                 }
10461
10462                 while (id >= (uint_t)(oldsvars = *np)) {
10463                         dtrace_statvar_t **statics;
10464                         int newsvars, oldsize, newsize;
10465
10466                         if ((newsvars = (oldsvars << 1)) == 0)
10467                                 newsvars = 1;
10468
10469                         oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10470                         newsize = newsvars * sizeof (dtrace_statvar_t *);
10471
10472                         statics = kmem_zalloc(newsize, KM_SLEEP);
10473
10474                         if (oldsize != 0) {
10475                                 bcopy(*svarp, statics, oldsize);
10476                                 kmem_free(*svarp, oldsize);
10477                         }
10478
10479                         *svarp = statics;
10480                         *np = newsvars;
10481                 }
10482
10483                 if ((svar = (*svarp)[id]) == NULL) {
10484                         svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10485                         svar->dtsv_var = *v;
10486
10487                         if ((svar->dtsv_size = dsize) != 0) {
10488                                 svar->dtsv_data = (uint64_t)(uintptr_t)
10489                                     kmem_zalloc(dsize, KM_SLEEP);
10490                         }
10491
10492                         (*svarp)[id] = svar;
10493                 }
10494
10495                 svar->dtsv_refcnt++;
10496         }
10497
10498         dtrace_difo_chunksize(dp, vstate);
10499         dtrace_difo_hold(dp);
10500 }
10501
10502 static dtrace_difo_t *
10503 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10504 {
10505         dtrace_difo_t *new;
10506         size_t sz;
10507
10508         ASSERT(dp->dtdo_buf != NULL);
10509         ASSERT(dp->dtdo_refcnt != 0);
10510
10511         new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10512
10513         ASSERT(dp->dtdo_buf != NULL);
10514         sz = dp->dtdo_len * sizeof (dif_instr_t);
10515         new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10516         bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10517         new->dtdo_len = dp->dtdo_len;
10518
10519         if (dp->dtdo_strtab != NULL) {
10520                 ASSERT(dp->dtdo_strlen != 0);
10521                 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10522                 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10523                 new->dtdo_strlen = dp->dtdo_strlen;
10524         }
10525
10526         if (dp->dtdo_inttab != NULL) {
10527                 ASSERT(dp->dtdo_intlen != 0);
10528                 sz = dp->dtdo_intlen * sizeof (uint64_t);
10529                 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10530                 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10531                 new->dtdo_intlen = dp->dtdo_intlen;
10532         }
10533
10534         if (dp->dtdo_vartab != NULL) {
10535                 ASSERT(dp->dtdo_varlen != 0);
10536                 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10537                 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10538                 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10539                 new->dtdo_varlen = dp->dtdo_varlen;
10540         }
10541
10542         dtrace_difo_init(new, vstate);
10543         return (new);
10544 }
10545
10546 static void
10547 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10548 {
10549         uint_t i;
10550
10551         ASSERT(dp->dtdo_refcnt == 0);
10552
10553         for (i = 0; i < dp->dtdo_varlen; i++) {
10554                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10555                 dtrace_statvar_t *svar;
10556                 dtrace_statvar_t **svarp = NULL;
10557                 uint_t id;
10558                 uint8_t scope = v->dtdv_scope;
10559                 int *np = NULL;
10560
10561                 switch (scope) {
10562                 case DIFV_SCOPE_THREAD:
10563                         continue;
10564
10565                 case DIFV_SCOPE_LOCAL:
10566                         np = &vstate->dtvs_nlocals;
10567                         svarp = vstate->dtvs_locals;
10568                         break;
10569
10570                 case DIFV_SCOPE_GLOBAL:
10571                         np = &vstate->dtvs_nglobals;
10572                         svarp = vstate->dtvs_globals;
10573                         break;
10574
10575                 default:
10576                         ASSERT(0);
10577                 }
10578
10579                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10580                         continue;
10581
10582                 id -= DIF_VAR_OTHER_UBASE;
10583
10584                 ASSERT(id < (uint_t)*np);
10585
10586                 svar = svarp[id];
10587                 ASSERT(svar != NULL);
10588                 ASSERT(svar->dtsv_refcnt > 0);
10589
10590                 if (--svar->dtsv_refcnt > 0)
10591                         continue;
10592
10593                 if (svar->dtsv_size != 0) {
10594                         ASSERT(svar->dtsv_data != 0);
10595                         kmem_free((void *)(uintptr_t)svar->dtsv_data,
10596                             svar->dtsv_size);
10597                 }
10598
10599                 kmem_free(svar, sizeof (dtrace_statvar_t));
10600                 svarp[id] = NULL;
10601         }
10602
10603         kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10604         kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10605         kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10606         kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10607
10608         kmem_free(dp, sizeof (dtrace_difo_t));
10609 }
10610
10611 static void
10612 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10613 {
10614         uint_t i;
10615
10616         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10617         ASSERT(dp->dtdo_refcnt != 0);
10618
10619         for (i = 0; i < dp->dtdo_varlen; i++) {
10620                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10621
10622                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10623                         continue;
10624
10625                 ASSERT(dtrace_vtime_references > 0);
10626                 if (--dtrace_vtime_references == 0)
10627                         dtrace_vtime_disable();
10628         }
10629
10630         if (--dp->dtdo_refcnt == 0)
10631                 dtrace_difo_destroy(dp, vstate);
10632 }
10633
10634 /*
10635  * DTrace Format Functions
10636  */
10637 static uint16_t
10638 dtrace_format_add(dtrace_state_t *state, char *str)
10639 {
10640         char *fmt, **new;
10641         uint16_t ndx, len = strlen(str) + 1;
10642
10643         fmt = kmem_zalloc(len, KM_SLEEP);
10644         bcopy(str, fmt, len);
10645
10646         for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10647                 if (state->dts_formats[ndx] == NULL) {
10648                         state->dts_formats[ndx] = fmt;
10649                         return (ndx + 1);
10650                 }
10651         }
10652
10653         if (state->dts_nformats == USHRT_MAX) {
10654                 /*
10655                  * This is only likely if a denial-of-service attack is being
10656                  * attempted.  As such, it's okay to fail silently here.
10657                  */
10658                 kmem_free(fmt, len);
10659                 return (0);
10660         }
10661
10662         /*
10663          * For simplicity, we always resize the formats array to be exactly the
10664          * number of formats.
10665          */
10666         ndx = state->dts_nformats++;
10667         new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10668
10669         if (state->dts_formats != NULL) {
10670                 ASSERT(ndx != 0);
10671                 bcopy(state->dts_formats, new, ndx * sizeof (char *));
10672                 kmem_free(state->dts_formats, ndx * sizeof (char *));
10673         }
10674
10675         state->dts_formats = new;
10676         state->dts_formats[ndx] = fmt;
10677
10678         return (ndx + 1);
10679 }
10680
10681 static void
10682 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10683 {
10684         char *fmt;
10685
10686         ASSERT(state->dts_formats != NULL);
10687         ASSERT(format <= state->dts_nformats);
10688         ASSERT(state->dts_formats[format - 1] != NULL);
10689
10690         fmt = state->dts_formats[format - 1];
10691         kmem_free(fmt, strlen(fmt) + 1);
10692         state->dts_formats[format - 1] = NULL;
10693 }
10694
10695 static void
10696 dtrace_format_destroy(dtrace_state_t *state)
10697 {
10698         int i;
10699
10700         if (state->dts_nformats == 0) {
10701                 ASSERT(state->dts_formats == NULL);
10702                 return;
10703         }
10704
10705         ASSERT(state->dts_formats != NULL);
10706
10707         for (i = 0; i < state->dts_nformats; i++) {
10708                 char *fmt = state->dts_formats[i];
10709
10710                 if (fmt == NULL)
10711                         continue;
10712
10713                 kmem_free(fmt, strlen(fmt) + 1);
10714         }
10715
10716         kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10717         state->dts_nformats = 0;
10718         state->dts_formats = NULL;
10719 }
10720
10721 /*
10722  * DTrace Predicate Functions
10723  */
10724 static dtrace_predicate_t *
10725 dtrace_predicate_create(dtrace_difo_t *dp)
10726 {
10727         dtrace_predicate_t *pred;
10728
10729         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10730         ASSERT(dp->dtdo_refcnt != 0);
10731
10732         pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10733         pred->dtp_difo = dp;
10734         pred->dtp_refcnt = 1;
10735
10736         if (!dtrace_difo_cacheable(dp))
10737                 return (pred);
10738
10739         if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10740                 /*
10741                  * This is only theoretically possible -- we have had 2^32
10742                  * cacheable predicates on this machine.  We cannot allow any
10743                  * more predicates to become cacheable:  as unlikely as it is,
10744                  * there may be a thread caching a (now stale) predicate cache
10745                  * ID. (N.B.: the temptation is being successfully resisted to
10746                  * have this cmn_err() "Holy shit -- we executed this code!")
10747                  */
10748                 return (pred);
10749         }
10750
10751         pred->dtp_cacheid = dtrace_predcache_id++;
10752
10753         return (pred);
10754 }
10755
10756 static void
10757 dtrace_predicate_hold(dtrace_predicate_t *pred)
10758 {
10759         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10760         ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10761         ASSERT(pred->dtp_refcnt > 0);
10762
10763         pred->dtp_refcnt++;
10764 }
10765
10766 static void
10767 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10768 {
10769         dtrace_difo_t *dp = pred->dtp_difo;
10770 #pragma unused(dp) /* __APPLE__ */
10771
10772         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10773         ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10774         ASSERT(pred->dtp_refcnt > 0);
10775
10776         if (--pred->dtp_refcnt == 0) {
10777                 dtrace_difo_release(pred->dtp_difo, vstate);
10778                 kmem_free(pred, sizeof (dtrace_predicate_t));
10779         }
10780 }
10781
10782 /*
10783  * DTrace Action Description Functions
10784  */
10785 static dtrace_actdesc_t *
10786 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10787     uint64_t uarg, uint64_t arg)
10788 {
10789         dtrace_actdesc_t *act;
10790
10791         ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
10792             arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
10793
10794         act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10795         act->dtad_kind = kind;
10796         act->dtad_ntuple = ntuple;
10797         act->dtad_uarg = uarg;
10798         act->dtad_arg = arg;
10799         act->dtad_refcnt = 1;
10800
10801         return (act);
10802 }
10803
10804 static void
10805 dtrace_actdesc_hold(dtrace_actdesc_t *act)
10806 {
10807         ASSERT(act->dtad_refcnt >= 1);
10808         act->dtad_refcnt++;
10809 }
10810
10811 static void
10812 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10813 {
10814         dtrace_actkind_t kind = act->dtad_kind;
10815         dtrace_difo_t *dp;
10816
10817         ASSERT(act->dtad_refcnt >= 1);
10818
10819         if (--act->dtad_refcnt != 0)
10820                 return;
10821
10822         if ((dp = act->dtad_difo) != NULL)
10823                 dtrace_difo_release(dp, vstate);
10824
10825         if (DTRACEACT_ISPRINTFLIKE(kind)) {
10826                 char *str = (char *)(uintptr_t)act->dtad_arg;
10827
10828                 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10829                     (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10830
10831                 if (str != NULL)
10832                         kmem_free(str, strlen(str) + 1);
10833         }
10834
10835         kmem_free(act, sizeof (dtrace_actdesc_t));
10836 }
10837
10838 /*
10839  * DTrace ECB Functions
10840  */
10841 static dtrace_ecb_t *
10842 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10843 {
10844         dtrace_ecb_t *ecb;
10845         dtrace_epid_t epid;
10846
10847         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10848
10849         ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10850         ecb->dte_predicate = NULL;
10851         ecb->dte_probe = probe;
10852
10853         /*
10854          * The default size is the size of the default action: recording
10855          * the header.
10856          */
10857         ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10858         ecb->dte_alignment = sizeof (dtrace_epid_t);
10859
10860         epid = state->dts_epid++;
10861
10862         if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
10863                 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10864                 int necbs = state->dts_necbs << 1;
10865
10866                 ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);
10867
10868                 if (necbs == 0) {
10869                         ASSERT(oecbs == NULL);
10870                         necbs = 1;
10871                 }
10872
10873                 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10874
10875                 if (oecbs != NULL)
10876                         bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10877
10878                 dtrace_membar_producer();
10879                 state->dts_ecbs = ecbs;
10880
10881                 if (oecbs != NULL) {
10882                         /*
10883                          * If this state is active, we must dtrace_sync()
10884                          * before we can free the old dts_ecbs array:  we're
10885                          * coming in hot, and there may be active ring
10886                          * buffer processing (which indexes into the dts_ecbs
10887                          * array) on another CPU.
10888                          */
10889                         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10890                                 dtrace_sync();
10891
10892                         kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10893                 }
10894
10895                 dtrace_membar_producer();
10896                 state->dts_necbs = necbs;
10897         }
10898
10899         ecb->dte_state = state;
10900
10901         ASSERT(state->dts_ecbs[epid - 1] == NULL);
10902         dtrace_membar_producer();
10903         state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10904
10905         return (ecb);
10906 }
10907
10908 static int
10909 dtrace_ecb_enable(dtrace_ecb_t *ecb)
10910 {
10911         dtrace_probe_t *probe = ecb->dte_probe;
10912
10913         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
10914         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10915         ASSERT(ecb->dte_next == NULL);
10916
10917         if (probe == NULL) {
10918                 /*
10919                  * This is the NULL probe -- there's nothing to do.
10920                  */
10921             return(0);
10922         }
10923
10924         probe->dtpr_provider->dtpv_ecb_count++;
10925         if (probe->dtpr_ecb == NULL) {
10926                 dtrace_provider_t *prov = probe->dtpr_provider;
10927
10928                 /*
10929                  * We're the first ECB on this probe.
10930                  */
10931                 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10932
10933                 if (ecb->dte_predicate != NULL)
10934                         probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10935
10936                 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10937                     probe->dtpr_id, probe->dtpr_arg));
10938         } else {
10939                 /*
10940                  * This probe is already active.  Swing the last pointer to
10941                  * point to the new ECB, and issue a dtrace_sync() to assure
10942                  * that all CPUs have seen the change.
10943                  */
10944                 ASSERT(probe->dtpr_ecb_last != NULL);
10945                 probe->dtpr_ecb_last->dte_next = ecb;
10946                 probe->dtpr_ecb_last = ecb;
10947                 probe->dtpr_predcache = 0;
10948
10949                 dtrace_sync();
10950                 return(0);
10951         }
10952 }
10953
10954 static int
10955 dtrace_ecb_resize(dtrace_ecb_t *ecb)
10956 {
10957         dtrace_action_t *act;
10958         uint32_t curneeded = UINT32_MAX;
10959         uint32_t aggbase = UINT32_MAX;
10960
10961         /*
10962          * If we record anything, we always record the dtrace_rechdr_t.  (And
10963          * we always record it first.)
10964          */
10965         ecb->dte_size = sizeof (dtrace_rechdr_t);
10966         ecb->dte_alignment = sizeof (dtrace_epid_t);
10967
10968         for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10969                 dtrace_recdesc_t *rec = &act->dta_rec;
10970                 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
10971
10972                 ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
10973
10974                 if (DTRACEACT_ISAGG(act->dta_kind)) {
10975                         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10976
10977                         ASSERT(rec->dtrd_size != 0);
10978                         ASSERT(agg->dtag_first != NULL);
10979                         ASSERT(act->dta_prev->dta_intuple);
10980                         ASSERT(aggbase != UINT32_MAX);
10981                         ASSERT(curneeded != UINT32_MAX);
10982
10983                         agg->dtag_base = aggbase;
10984                         curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10985                         rec->dtrd_offset = curneeded;
10986                         if (curneeded + rec->dtrd_size < curneeded)
10987                                 return (EINVAL);
10988                         curneeded += rec->dtrd_size;
10989                         ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
10990
10991                         aggbase = UINT32_MAX;
10992                         curneeded = UINT32_MAX;
10993                 } else if (act->dta_intuple) {
10994                         if (curneeded == UINT32_MAX) {
10995                                 /*
10996                                  * This is the first record in a tuple.  Align
10997                                  * curneeded to be at offset 4 in an 8-byte
10998                                  * aligned block.
10999                                  */
11000                                 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
11001                                 ASSERT(aggbase == UINT32_MAX);
11002
11003                                 curneeded = P2PHASEUP(ecb->dte_size,
11004                                     sizeof (uint64_t), sizeof (dtrace_aggid_t));
11005
11006                                 aggbase = curneeded - sizeof (dtrace_aggid_t);
11007                                 ASSERT(IS_P2ALIGNED(aggbase,
11008                                     sizeof (uint64_t)));
11009                         }
11010
11011                         curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11012                         rec->dtrd_offset = curneeded;
11013                         curneeded += rec->dtrd_size;
11014                         if (curneeded + rec->dtrd_size < curneeded)
11015                                 return (EINVAL);
11016                 } else {
11017                         /* tuples must be followed by an aggregation */
11018                         ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
11019                         ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
11020                         rec->dtrd_offset = ecb->dte_size;
11021                         if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
11022                                 return (EINVAL);
11023                         ecb->dte_size += rec->dtrd_size;
11024                         ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
11025                 }
11026         }
11027
11028         if ((act = ecb->dte_action) != NULL &&
11029             !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
11030             ecb->dte_size == sizeof (dtrace_rechdr_t)) {
11031                 /*
11032                  * If the size is still sizeof (dtrace_rechdr_t), then all
11033                  * actions store no data; set the size to 0.
11034                  */
11035                 ecb->dte_size = 0;
11036         }
11037
11038         ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
11039         ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
11040         ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
11041         return (0);
11042 }
11043
11044 static dtrace_action_t *
11045 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11046 {
11047         dtrace_aggregation_t *agg;
11048         size_t size = sizeof (uint64_t);
11049         int ntuple = desc->dtad_ntuple;
11050         dtrace_action_t *act;
11051         dtrace_recdesc_t *frec;
11052         dtrace_aggid_t aggid;
11053         dtrace_state_t *state = ecb->dte_state;
11054
11055         agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
11056         agg->dtag_ecb = ecb;
11057
11058         ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
11059
11060         switch (desc->dtad_kind) {
11061         case DTRACEAGG_MIN:
11062                 agg->dtag_initial = INT64_MAX;
11063                 agg->dtag_aggregate = dtrace_aggregate_min;
11064                 break;
11065
11066         case DTRACEAGG_MAX:
11067                 agg->dtag_initial = INT64_MIN;
11068                 agg->dtag_aggregate = dtrace_aggregate_max;
11069                 break;
11070
11071         case DTRACEAGG_COUNT:
11072                 agg->dtag_aggregate = dtrace_aggregate_count;
11073                 break;
11074
11075         case DTRACEAGG_QUANTIZE:
11076                 agg->dtag_aggregate = dtrace_aggregate_quantize;
11077                 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
11078                     sizeof (uint64_t);
11079                 break;
11080
11081         case DTRACEAGG_LQUANTIZE: {
11082                 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
11083                 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
11084
11085                 agg->dtag_initial = desc->dtad_arg;
11086                 agg->dtag_aggregate = dtrace_aggregate_lquantize;
11087
11088                 if (step == 0 || levels == 0)
11089                         goto err;
11090
11091                 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
11092                 break;
11093         }
11094
11095         case DTRACEAGG_LLQUANTIZE: {
11096                 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
11097                 uint16_t low    = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
11098                 uint16_t high   = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
11099                 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
11100                 int64_t v;
11101
11102                 agg->dtag_initial = desc->dtad_arg;
11103                 agg->dtag_aggregate = dtrace_aggregate_llquantize;
11104
11105                 if (factor < 2 || low >= high || nsteps < factor)
11106                         goto err;
11107
11108                 /*
11109                  * Now check that the number of steps evenly divides a power
11110                  * of the factor.  (This assures both integer bucket size and
11111                  * linearity within each magnitude.)
11112                  */
11113                 for (v = factor; v < nsteps; v *= factor)
11114                         continue;
11115
11116                 if ((v % nsteps) || (nsteps % factor))
11117                         goto err;
11118
11119                 size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
11120                 break;
11121   }
11122
11123         case DTRACEAGG_AVG:
11124                 agg->dtag_aggregate = dtrace_aggregate_avg;
11125                 size = sizeof (uint64_t) * 2;
11126                 break;
11127
11128         case DTRACEAGG_STDDEV:
11129                 agg->dtag_aggregate = dtrace_aggregate_stddev;
11130                 size = sizeof (uint64_t) * 4;
11131                 break;
11132
11133         case DTRACEAGG_SUM:
11134                 agg->dtag_aggregate = dtrace_aggregate_sum;
11135                 break;
11136
11137         default:
11138                 goto err;
11139         }
11140
11141         agg->dtag_action.dta_rec.dtrd_size = size;
11142
11143         if (ntuple == 0)
11144                 goto err;
11145
11146         /*
11147          * We must make sure that we have enough actions for the n-tuple.
11148          */
11149         for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
11150                 if (DTRACEACT_ISAGG(act->dta_kind))
11151                         break;
11152
11153                 if (--ntuple == 0) {
11154                         /*
11155                          * This is the action with which our n-tuple begins.
11156                          */
11157                         agg->dtag_first = act;
11158                         goto success;
11159                 }
11160         }
11161
11162         /*
11163          * This n-tuple is short by ntuple elements.  Return failure.
11164          */
11165         ASSERT(ntuple != 0);
11166 err:
11167         kmem_free(agg, sizeof (dtrace_aggregation_t));
11168         return (NULL);
11169
11170 success:
11171         /*
11172          * If the last action in the tuple has a size of zero, it's actually
11173          * an expression argument for the aggregating action.
11174          */
11175         ASSERT(ecb->dte_action_last != NULL);
11176         act = ecb->dte_action_last;
11177
11178         if (act->dta_kind == DTRACEACT_DIFEXPR) {
11179                 ASSERT(act->dta_difo != NULL);
11180
11181                 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11182                         agg->dtag_hasarg = 1;
11183         }
11184
11185         /*
11186          * We need to allocate an id for this aggregation.
11187          */
11188         aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11189             VM_BESTFIT | VM_SLEEP);
11190
11191         if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
11192                 dtrace_aggregation_t **oaggs = state->dts_aggregations;
11193                 dtrace_aggregation_t **aggs;
11194                 int naggs = state->dts_naggregations << 1;
11195                 int onaggs = state->dts_naggregations;
11196
11197                 ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);
11198
11199                 if (naggs == 0) {
11200                         ASSERT(oaggs == NULL);
11201                         naggs = 1;
11202                 }
11203
11204                 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11205
11206                 if (oaggs != NULL) {
11207                         bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11208                         kmem_free(oaggs, onaggs * sizeof (*aggs));
11209                 }
11210
11211                 state->dts_aggregations = aggs;
11212                 state->dts_naggregations = naggs;
11213         }
11214
11215         ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11216         state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11217
11218         frec = &agg->dtag_first->dta_rec;
11219         if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11220                 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11221
11222         for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11223                 ASSERT(!act->dta_intuple);
11224                 act->dta_intuple = 1;
11225         }
11226
11227         return (&agg->dtag_action);
11228 }
11229
11230 static void
11231 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11232 {
11233         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11234         dtrace_state_t *state = ecb->dte_state;
11235         dtrace_aggid_t aggid = agg->dtag_id;
11236
11237         ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11238         vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11239
11240         ASSERT(state->dts_aggregations[aggid - 1] == agg);
11241         state->dts_aggregations[aggid - 1] = NULL;
11242
11243         kmem_free(agg, sizeof (dtrace_aggregation_t));
11244 }
11245
11246 static int
11247 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11248 {
11249         dtrace_action_t *action, *last;
11250         dtrace_difo_t *dp = desc->dtad_difo;
11251         uint32_t size = 0, align = sizeof (uint8_t), mask;
11252         uint16_t format = 0;
11253         dtrace_recdesc_t *rec;
11254         dtrace_state_t *state = ecb->dte_state;
11255         dtrace_optval_t *opt = state->dts_options;
11256         dtrace_optval_t nframes=0, strsize;
11257         uint64_t arg = desc->dtad_arg;
11258
11259         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11260         ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11261
11262         if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11263                 /*
11264                  * If this is an aggregating action, there must be neither
11265                  * a speculate nor a commit on the action chain.
11266                  */
11267                 dtrace_action_t *act;
11268
11269                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11270                         if (act->dta_kind == DTRACEACT_COMMIT)
11271                                 return (EINVAL);
11272
11273                         if (act->dta_kind == DTRACEACT_SPECULATE)
11274                                 return (EINVAL);
11275                 }
11276
11277                 action = dtrace_ecb_aggregation_create(ecb, desc);
11278
11279                 if (action == NULL)
11280                         return (EINVAL);
11281         } else {
11282                 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11283                     (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11284                     dp != NULL && dp->dtdo_destructive)) {
11285                         state->dts_destructive = 1;
11286                 }
11287
11288                 switch (desc->dtad_kind) {
11289                 case DTRACEACT_PRINTF:
11290                 case DTRACEACT_PRINTA:
11291                 case DTRACEACT_SYSTEM:
11292                 case DTRACEACT_FREOPEN:
11293                 case DTRACEACT_DIFEXPR:
11294                         /*
11295                          * We know that our arg is a string -- turn it into a
11296                          * format.
11297                          */
11298                         if (arg == 0) {
11299                                 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11300                                        desc->dtad_kind == DTRACEACT_DIFEXPR);
11301                                 format = 0;
11302                         } else {
11303                                 ASSERT(arg != 0);
11304                                 ASSERT(arg > KERNELBASE);
11305                                 format = dtrace_format_add(state,
11306                                     (char *)(uintptr_t)arg);
11307                         }
11308
11309                         /*FALLTHROUGH*/
11310                 case DTRACEACT_LIBACT:
11311                 case DTRACEACT_TRACEMEM:
11312                 case DTRACEACT_TRACEMEM_DYNSIZE:
11313                 case DTRACEACT_APPLEBINARY:     /* __APPLE__ */
11314                         if (dp == NULL)
11315                                 return (EINVAL);
11316
11317                         if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11318                                 break;
11319
11320                         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11321                                 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11322                                         return (EINVAL);
11323
11324                                 size = opt[DTRACEOPT_STRSIZE];
11325                         }
11326
11327                         break;
11328
11329                 case DTRACEACT_STACK:
11330                         if ((nframes = arg) == 0) {
11331                                 nframes = opt[DTRACEOPT_STACKFRAMES];
11332                                 ASSERT(nframes > 0);
11333                                 arg = nframes;
11334                         }
11335
11336                         size = nframes * sizeof (pc_t);
11337                         break;
11338
11339                 case DTRACEACT_JSTACK:
11340                         if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11341                                 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11342
11343                         if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11344                                 nframes = opt[DTRACEOPT_JSTACKFRAMES];
11345
11346                         arg = DTRACE_USTACK_ARG(nframes, strsize);
11347
11348                         /*FALLTHROUGH*/
11349                 case DTRACEACT_USTACK:
11350                         if (desc->dtad_kind != DTRACEACT_JSTACK &&
11351                             (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11352                                 strsize = DTRACE_USTACK_STRSIZE(arg);
11353                                 nframes = opt[DTRACEOPT_USTACKFRAMES];
11354                                 ASSERT(nframes > 0);
11355                                 arg = DTRACE_USTACK_ARG(nframes, strsize);
11356                         }
11357
11358                         /*
11359                          * Save a slot for the pid.
11360                          */
11361                         size = (nframes + 1) * sizeof (uint64_t);
11362                         size += DTRACE_USTACK_STRSIZE(arg);
11363                         size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11364
11365                         break;
11366
11367                 case DTRACEACT_SYM:
11368                 case DTRACEACT_MOD:
11369                         if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11370                             sizeof (uint64_t)) ||
11371                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11372                                 return (EINVAL);
11373                         break;
11374
11375                 case DTRACEACT_USYM:
11376                 case DTRACEACT_UMOD:
11377                 case DTRACEACT_UADDR:
11378                         if (dp == NULL ||
11379                             (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11380                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11381                                 return (EINVAL);
11382
11383                         /*
11384                          * We have a slot for the pid, plus a slot for the
11385                          * argument.  To keep things simple (aligned with
11386                          * bitness-neutral sizing), we store each as a 64-bit
11387                          * quantity.
11388                          */
11389                         size = 2 * sizeof (uint64_t);
11390                         break;
11391
11392                 case DTRACEACT_STOP:
11393                 case DTRACEACT_BREAKPOINT:
11394                 case DTRACEACT_PANIC:
11395                         break;
11396
11397                 case DTRACEACT_CHILL:
11398                 case DTRACEACT_DISCARD:
11399                 case DTRACEACT_RAISE:
11400                 case DTRACEACT_PIDRESUME:       /* __APPLE__ */
11401                         if (dp == NULL)
11402                                 return (EINVAL);
11403                         break;
11404
11405                 case DTRACEACT_EXIT:
11406                         if (dp == NULL ||
11407                             (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11408                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11409                                 return (EINVAL);
11410                         break;
11411
11412                 case DTRACEACT_SPECULATE:
11413                         if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11414                                 return (EINVAL);
11415
11416                         if (dp == NULL)
11417                                 return (EINVAL);
11418
11419                         state->dts_speculates = 1;
11420                         break;
11421
11422                 case DTRACEACT_COMMIT: {
11423                         dtrace_action_t *act = ecb->dte_action;
11424
11425                         for (; act != NULL; act = act->dta_next) {
11426                                 if (act->dta_kind == DTRACEACT_COMMIT)
11427                                         return (EINVAL);
11428                         }
11429
11430                         if (dp == NULL)
11431                                 return (EINVAL);
11432                         break;
11433                 }
11434
11435                 default:
11436                         return (EINVAL);
11437                 }
11438
11439                 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11440                         /*
11441                          * If this is a data-storing action or a speculate,
11442                          * we must be sure that there isn't a commit on the
11443                          * action chain.
11444                          */
11445                         dtrace_action_t *act = ecb->dte_action;
11446
11447                         for (; act != NULL; act = act->dta_next) {
11448                                 if (act->dta_kind == DTRACEACT_COMMIT)
11449                                         return (EINVAL);
11450                         }
11451                 }
11452
11453                 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11454                 action->dta_rec.dtrd_size = size;
11455         }
11456
11457         action->dta_refcnt = 1;
11458         rec = &action->dta_rec;
11459         size = rec->dtrd_size;
11460
11461         for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11462                 if (!(size & mask)) {
11463                         align = mask + 1;
11464                         break;
11465                 }
11466         }
11467
11468         action->dta_kind = desc->dtad_kind;
11469
11470         if ((action->dta_difo = dp) != NULL)
11471                 dtrace_difo_hold(dp);
11472
11473         rec->dtrd_action = action->dta_kind;
11474         rec->dtrd_arg = arg;
11475         rec->dtrd_uarg = desc->dtad_uarg;
11476         rec->dtrd_alignment = (uint16_t)align;
11477         rec->dtrd_format = format;
11478
11479         if ((last = ecb->dte_action_last) != NULL) {
11480                 ASSERT(ecb->dte_action != NULL);
11481                 action->dta_prev = last;
11482                 last->dta_next = action;
11483         } else {
11484                 ASSERT(ecb->dte_action == NULL);
11485                 ecb->dte_action = action;
11486         }
11487
11488         ecb->dte_action_last = action;
11489
11490         return (0);
11491 }
11492
11493 static void
11494 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11495 {
11496         dtrace_action_t *act = ecb->dte_action, *next;
11497         dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11498         dtrace_difo_t *dp;
11499         uint16_t format;
11500
11501         if (act != NULL && act->dta_refcnt > 1) {
11502                 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11503                 act->dta_refcnt--;
11504         } else {
11505                 for (; act != NULL; act = next) {
11506                         next = act->dta_next;
11507                         ASSERT(next != NULL || act == ecb->dte_action_last);
11508                         ASSERT(act->dta_refcnt == 1);
11509
11510                         if ((format = act->dta_rec.dtrd_format) != 0)
11511                                 dtrace_format_remove(ecb->dte_state, format);
11512
11513                         if ((dp = act->dta_difo) != NULL)
11514                                 dtrace_difo_release(dp, vstate);
11515
11516                         if (DTRACEACT_ISAGG(act->dta_kind)) {
11517                                 dtrace_ecb_aggregation_destroy(ecb, act);
11518                         } else {
11519                                 kmem_free(act, sizeof (dtrace_action_t));
11520                         }
11521                 }
11522         }
11523
11524         ecb->dte_action = NULL;
11525         ecb->dte_action_last = NULL;
11526         ecb->dte_size = 0;
11527 }
11528
11529 static void
11530 dtrace_ecb_disable(dtrace_ecb_t *ecb)
11531 {
11532         /*
11533          * We disable the ECB by removing it from its probe.
11534          */
11535         dtrace_ecb_t *pecb, *prev = NULL;
11536         dtrace_probe_t *probe = ecb->dte_probe;
11537
11538         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11539
11540         if (probe == NULL) {
11541                 /*
11542                  * This is the NULL probe; there is nothing to disable.
11543                  */
11544                 return;
11545         }
11546
11547         for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11548                 if (pecb == ecb)
11549                         break;
11550                 prev = pecb;
11551         }
11552
11553         ASSERT(pecb != NULL);
11554
11555         if (prev == NULL) {
11556                 probe->dtpr_ecb = ecb->dte_next;
11557         } else {
11558                 prev->dte_next = ecb->dte_next;
11559         }
11560
11561         if (ecb == probe->dtpr_ecb_last) {
11562                 ASSERT(ecb->dte_next == NULL);
11563                 probe->dtpr_ecb_last = prev;
11564         }
11565
11566         probe->dtpr_provider->dtpv_ecb_count--;
11567         /*
11568          * The ECB has been disconnected from the probe; now sync to assure
11569          * that all CPUs have seen the change before returning.
11570          */
11571         dtrace_sync();
11572
11573         if (probe->dtpr_ecb == NULL) {
11574                 /*
11575                  * That was the last ECB on the probe; clear the predicate
11576                  * cache ID for the probe, disable it and sync one more time
11577                  * to assure that we'll never hit it again.
11578                  */
11579                 dtrace_provider_t *prov = probe->dtpr_provider;
11580
11581                 ASSERT(ecb->dte_next == NULL);
11582                 ASSERT(probe->dtpr_ecb_last == NULL);
11583                 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11584                 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11585                     probe->dtpr_id, probe->dtpr_arg);
11586                 dtrace_sync();
11587         } else {
11588                 /*
11589                  * There is at least one ECB remaining on the probe.  If there
11590                  * is _exactly_ one, set the probe's predicate cache ID to be
11591                  * the predicate cache ID of the remaining ECB.
11592                  */
11593                 ASSERT(probe->dtpr_ecb_last != NULL);
11594                 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11595
11596                 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11597                         dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11598
11599                         ASSERT(probe->dtpr_ecb->dte_next == NULL);
11600
11601                         if (p != NULL)
11602                                 probe->dtpr_predcache = p->dtp_cacheid;
11603                 }
11604
11605                 ecb->dte_next = NULL;
11606         }
11607 }
11608
11609 static void
11610 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11611 {
11612         dtrace_state_t *state = ecb->dte_state;
11613         dtrace_vstate_t *vstate = &state->dts_vstate;
11614         dtrace_predicate_t *pred;
11615         dtrace_epid_t epid = ecb->dte_epid;
11616
11617         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11618         ASSERT(ecb->dte_next == NULL);
11619         ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11620
11621         if ((pred = ecb->dte_predicate) != NULL)
11622                 dtrace_predicate_release(pred, vstate);
11623
11624         dtrace_ecb_action_remove(ecb);
11625
11626         ASSERT(state->dts_ecbs[epid - 1] == ecb);
11627         state->dts_ecbs[epid - 1] = NULL;
11628
11629         kmem_free(ecb, sizeof (dtrace_ecb_t));
11630 }
11631
11632 static dtrace_ecb_t *
11633 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11634     dtrace_enabling_t *enab)
11635 {
11636         dtrace_ecb_t *ecb;
11637         dtrace_predicate_t *pred;
11638         dtrace_actdesc_t *act;
11639         dtrace_provider_t *prov;
11640         dtrace_ecbdesc_t *desc = enab->dten_current;
11641
11642         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11643         ASSERT(state != NULL);
11644
11645         ecb = dtrace_ecb_add(state, probe);
11646         ecb->dte_uarg = desc->dted_uarg;
11647
11648         if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11649                 dtrace_predicate_hold(pred);
11650                 ecb->dte_predicate = pred;
11651         }
11652
11653         if (probe != NULL) {
11654                 /*
11655                  * If the provider shows more leg than the consumer is old
11656                  * enough to see, we need to enable the appropriate implicit
11657                  * predicate bits to prevent the ecb from activating at
11658                  * revealing times.
11659                  *
11660                  * Providers specifying DTRACE_PRIV_USER at register time
11661                  * are stating that they need the /proc-style privilege
11662                  * model to be enforced, and this is what DTRACE_COND_OWNER
11663                  * and DTRACE_COND_ZONEOWNER will then do at probe time.
11664                  */
11665                 prov = probe->dtpr_provider;
11666                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11667                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11668                         ecb->dte_cond |= DTRACE_COND_OWNER;
11669
11670                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11671                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11672                         ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11673
11674                 /*
11675                  * If the provider shows us kernel innards and the user
11676                  * is lacking sufficient privilege, enable the
11677                  * DTRACE_COND_USERMODE implicit predicate.
11678                  */
11679                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11680                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11681                         ecb->dte_cond |= DTRACE_COND_USERMODE;
11682         }
11683
11684         if (dtrace_ecb_create_cache != NULL) {
11685                 /*
11686                  * If we have a cached ecb, we'll use its action list instead
11687                  * of creating our own (saving both time and space).
11688                  */
11689                 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11690                 dtrace_action_t *act_if = cached->dte_action;
11691
11692                 if (act_if != NULL) {
11693                         ASSERT(act_if->dta_refcnt > 0);
11694                         act_if->dta_refcnt++;
11695                         ecb->dte_action = act_if;
11696                         ecb->dte_action_last = cached->dte_action_last;
11697                         ecb->dte_needed = cached->dte_needed;
11698                         ecb->dte_size = cached->dte_size;
11699                         ecb->dte_alignment = cached->dte_alignment;
11700                 }
11701
11702                 return (ecb);
11703         }
11704
11705         for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11706                 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11707                         dtrace_ecb_destroy(ecb);
11708                         return (NULL);
11709                 }
11710         }
11711
11712         if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
11713                 dtrace_ecb_destroy(ecb);
11714                 return (NULL);
11715         }
11716
11717         return (dtrace_ecb_create_cache = ecb);
11718 }
11719
11720 static int
11721 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg1, void *arg2)
11722 {
11723         dtrace_ecb_t *ecb;
11724         dtrace_enabling_t *enab = arg1;
11725         dtrace_ecbdesc_t *ep = arg2;
11726         dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11727
11728         ASSERT(state != NULL);
11729
11730         if (probe != NULL && ep != NULL && probe->dtpr_gen < ep->dted_probegen) {
11731                 /*
11732                  * This probe was created in a generation for which this
11733                  * enabling has previously created ECBs; we don't want to
11734                  * enable it again, so just kick out.
11735                  */
11736                 return (DTRACE_MATCH_NEXT);
11737         }
11738
11739         if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11740                 return (DTRACE_MATCH_DONE);
11741
11742         if (dtrace_ecb_enable(ecb) < 0)
11743                return (DTRACE_MATCH_FAIL);
11744
11745         return (DTRACE_MATCH_NEXT);
11746 }
11747
11748 static dtrace_ecb_t *
11749 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11750 {
11751         dtrace_ecb_t *ecb;
11752 #pragma unused(ecb) /* __APPLE__ */
11753
11754         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11755
11756         if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
11757                 return (NULL);
11758
11759         ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11760         ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11761
11762         return (state->dts_ecbs[id - 1]);
11763 }
11764
11765 static dtrace_aggregation_t *
11766 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11767 {
11768         dtrace_aggregation_t *agg;
11769 #pragma unused(agg) /* __APPLE__ */
11770
11771         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11772
11773         if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
11774                 return (NULL);
11775
11776         ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11777         ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11778             agg->dtag_id == id);
11779
11780         return (state->dts_aggregations[id - 1]);
11781 }
11782
11783 /*
11784  * DTrace Buffer Functions
11785  *
11786  * The following functions manipulate DTrace buffers.  Most of these functions
11787  * are called in the context of establishing or processing consumer state;
11788  * exceptions are explicitly noted.
11789  */
11790
11791 /*
11792  * Note:  called from cross call context.  This function switches the two
11793  * buffers on a given CPU.  The atomicity of this operation is assured by
11794  * disabling interrupts while the actual switch takes place; the disabling of
11795  * interrupts serializes the execution with any execution of dtrace_probe() on
11796  * the same CPU.
11797  */
11798 static void
11799 dtrace_buffer_switch(dtrace_buffer_t *buf)
11800 {
11801         caddr_t tomax = buf->dtb_tomax;
11802         caddr_t xamot = buf->dtb_xamot;
11803         dtrace_icookie_t cookie;
11804         hrtime_t now;
11805
11806         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11807         ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11808
11809         cookie = dtrace_interrupt_disable();
11810         now = dtrace_gethrtime();
11811         buf->dtb_tomax = xamot;
11812         buf->dtb_xamot = tomax;
11813         buf->dtb_xamot_drops = buf->dtb_drops;
11814         buf->dtb_xamot_offset = buf->dtb_offset;
11815         buf->dtb_xamot_errors = buf->dtb_errors;
11816         buf->dtb_xamot_flags = buf->dtb_flags;
11817         buf->dtb_offset = 0;
11818         buf->dtb_drops = 0;
11819         buf->dtb_errors = 0;
11820         buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11821         buf->dtb_interval = now - buf->dtb_switched;
11822         buf->dtb_switched = now;
11823         buf->dtb_cur_limit = buf->dtb_limit;
11824
11825         dtrace_interrupt_enable(cookie);
11826 }
11827
11828 /*
11829  * Note:  called from cross call context.  This function activates a buffer
11830  * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
11831  * is guaranteed by the disabling of interrupts.
11832  */
11833 static void
11834 dtrace_buffer_activate(dtrace_state_t *state)
11835 {
11836         dtrace_buffer_t *buf;
11837         dtrace_icookie_t cookie = dtrace_interrupt_disable();
11838
11839         buf = &state->dts_buffer[CPU->cpu_id];
11840
11841         if (buf->dtb_tomax != NULL) {
11842                 /*
11843                  * We might like to assert that the buffer is marked inactive,
11844                  * but this isn't necessarily true:  the buffer for the CPU
11845                  * that processes the BEGIN probe has its buffer activated
11846                  * manually.  In this case, we take the (harmless) action
11847                  * re-clearing the bit INACTIVE bit.
11848                  */
11849                 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11850         }
11851
11852         dtrace_interrupt_enable(cookie);
11853 }
11854
11855 static int
11856 dtrace_buffer_canalloc(size_t size)
11857 {
11858         if (size > (UINT64_MAX - dtrace_buffer_memory_inuse))
11859                 return (B_FALSE);
11860         if ((size + dtrace_buffer_memory_inuse) > dtrace_buffer_memory_maxsize)
11861                 return (B_FALSE);
11862
11863         return (B_TRUE);
11864 }
11865
11866 static int
11867 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t limit, size_t size, int flags,
11868     processorid_t cpu)
11869 {
11870         dtrace_cpu_t *cp;
11871         dtrace_buffer_t *buf;
11872         size_t size_before_alloc = dtrace_buffer_memory_inuse;
11873
11874         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11875         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11876
11877         if (size > (size_t)dtrace_nonroot_maxsize &&
11878             !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11879                 return (EFBIG);
11880
11881         cp = cpu_list;
11882
11883         do {
11884                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11885                         continue;
11886
11887                 buf = &bufs[cp->cpu_id];
11888
11889                 /*
11890                  * If there is already a buffer allocated for this CPU, it
11891                  * is only possible that this is a DR event.  In this case,
11892                  * the buffer size must match our specified size.
11893                  */
11894                 if (buf->dtb_tomax != NULL) {
11895                         ASSERT(buf->dtb_size == size);
11896                         continue;
11897                 }
11898
11899                 ASSERT(buf->dtb_xamot == NULL);
11900
11901                 /* DTrace, please do not eat all the memory. */
11902                 if (dtrace_buffer_canalloc(size) == B_FALSE)
11903                         goto err;
11904                 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11905                         goto err;
11906                 dtrace_buffer_memory_inuse += size;
11907
11908                 /* Unsure that limit is always lower than size */
11909                 limit = limit == size ? limit - 1 : limit;
11910                 buf->dtb_cur_limit = limit;
11911                 buf->dtb_limit = limit;
11912                 buf->dtb_size = size;
11913                 buf->dtb_flags = flags;
11914                 buf->dtb_offset = 0;
11915                 buf->dtb_drops = 0;
11916
11917                 if (flags & DTRACEBUF_NOSWITCH)
11918                         continue;
11919
11920                 /* DTrace, please do not eat all the memory. */
11921                 if (dtrace_buffer_canalloc(size) == B_FALSE)
11922                         goto err;
11923                 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11924                         goto err;
11925                 dtrace_buffer_memory_inuse += size;
11926         } while ((cp = cp->cpu_next) != cpu_list);
11927
11928         ASSERT(dtrace_buffer_memory_inuse <= dtrace_buffer_memory_maxsize);
11929
11930         return (0);
11931
11932 err:
11933         cp = cpu_list;
11934
11935         do {
11936                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11937                         continue;
11938
11939                 buf = &bufs[cp->cpu_id];
11940
11941                 if (buf->dtb_xamot != NULL) {
11942                         ASSERT(buf->dtb_tomax != NULL);
11943                         ASSERT(buf->dtb_size == size);
11944                         kmem_free(buf->dtb_xamot, size);
11945                 }
11946
11947                 if (buf->dtb_tomax != NULL) {
11948                         ASSERT(buf->dtb_size == size);
11949                         kmem_free(buf->dtb_tomax, size);
11950                 }
11951
11952                 buf->dtb_tomax = NULL;
11953                 buf->dtb_xamot = NULL;
11954                 buf->dtb_size = 0;
11955         } while ((cp = cp->cpu_next) != cpu_list);
11956
11957         /* Restore the size saved before allocating memory */
11958         dtrace_buffer_memory_inuse = size_before_alloc;
11959
11960         return (ENOMEM);
11961 }
11962
11963 /*
11964  * Note:  called from probe context.  This function just increments the drop
11965  * count on a buffer.  It has been made a function to allow for the
11966  * possibility of understanding the source of mysterious drop counts.  (A
11967  * problem for which one may be particularly disappointed that DTrace cannot
11968  * be used to understand DTrace.)
11969  */
11970 static void
11971 dtrace_buffer_drop(dtrace_buffer_t *buf)
11972 {
11973         buf->dtb_drops++;
11974 }
11975
11976 /*
11977  * Note:  called from probe context.  This function is called to reserve space
11978  * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
11979  * mstate.  Returns the new offset in the buffer, or a negative value if an
11980  * error has occurred.
11981  */
11982 static intptr_t
11983 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11984     dtrace_state_t *state, dtrace_mstate_t *mstate)
11985 {
11986         intptr_t offs = buf->dtb_offset, soffs;
11987         intptr_t woffs;
11988         caddr_t tomax;
11989         size_t total_off;
11990
11991         if (buf->dtb_flags & DTRACEBUF_INACTIVE)
11992                 return (-1);
11993
11994         if ((tomax = buf->dtb_tomax) == NULL) {
11995                 dtrace_buffer_drop(buf);
11996                 return (-1);
11997         }
11998
11999         if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
12000                 while (offs & (align - 1)) {
12001                         /*
12002                          * Assert that our alignment is off by a number which
12003                          * is itself sizeof (uint32_t) aligned.
12004                          */
12005                         ASSERT(!((align - (offs & (align - 1))) &
12006                             (sizeof (uint32_t) - 1)));
12007                         DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12008                         offs += sizeof (uint32_t);
12009                 }
12010
12011                 if ((uint64_t)(soffs = offs + needed) > buf->dtb_cur_limit) {
12012                         if (buf->dtb_cur_limit == buf->dtb_limit) {
12013                                 buf->dtb_cur_limit = buf->dtb_size;
12014
12015                                 os_atomic_inc(&state->dts_buf_over_limit, relaxed);
12016                                 /**
12017                                  * Set an AST on the current processor
12018                                  * so that we can wake up the process
12019                                  * outside of probe context, when we know
12020                                  * it is safe to do so
12021                                  */
12022                                 minor_t minor = getminor(state->dts_dev);
12023                                 ASSERT(minor < 32);
12024
12025                                 os_atomic_or(&dtrace_wake_clients, 1 << minor, relaxed);
12026                                 ast_dtrace_on();
12027                         }
12028                         if ((uint64_t)soffs > buf->dtb_size) {
12029                                 dtrace_buffer_drop(buf);
12030                                 return (-1);
12031                         }
12032                 }
12033
12034                 if (mstate == NULL)
12035                         return (offs);
12036
12037                 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12038                 mstate->dtms_scratch_size = buf->dtb_size - soffs;
12039                 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12040
12041                 return (offs);
12042         }
12043
12044         if (buf->dtb_flags & DTRACEBUF_FILL) {
12045                 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12046                     (buf->dtb_flags & DTRACEBUF_FULL))
12047                         return (-1);
12048                 goto out;
12049         }
12050
12051         total_off = needed + (offs & (align - 1));
12052
12053         /*
12054          * For a ring buffer, life is quite a bit more complicated.  Before
12055          * we can store any padding, we need to adjust our wrapping offset.
12056          * (If we've never before wrapped or we're not about to, no adjustment
12057          * is required.)
12058          */
12059         if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
12060             offs + total_off > buf->dtb_size) {
12061                 woffs = buf->dtb_xamot_offset;
12062
12063                 if (offs + total_off > buf->dtb_size) {
12064                         /*
12065                          * We can't fit in the end of the buffer.  First, a
12066                          * sanity check that we can fit in the buffer at all.
12067                          */
12068                         if (total_off > buf->dtb_size) {
12069                                 dtrace_buffer_drop(buf);
12070                                 return (-1);
12071                         }
12072
12073                         /*
12074                          * We're going to be storing at the top of the buffer,
12075                          * so now we need to deal with the wrapped offset.  We
12076                          * only reset our wrapped offset to 0 if it is
12077                          * currently greater than the current offset.  If it
12078                          * is less than the current offset, it is because a
12079                          * previous allocation induced a wrap -- but the
12080                          * allocation didn't subsequently take the space due
12081                          * to an error or false predicate evaluation.  In this
12082                          * case, we'll just leave the wrapped offset alone: if
12083                          * the wrapped offset hasn't been advanced far enough
12084                          * for this allocation, it will be adjusted in the
12085                          * lower loop.
12086                          */
12087                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12088                                 if (woffs >= offs)
12089                                         woffs = 0;
12090                         } else {
12091                                 woffs = 0;
12092                         }
12093
12094                         /*
12095                          * Now we know that we're going to be storing to the
12096                          * top of the buffer and that there is room for us
12097                          * there.  We need to clear the buffer from the current
12098                          * offset to the end (there may be old gunk there).
12099                          */
12100                         while ((uint64_t)offs < buf->dtb_size)
12101                                 tomax[offs++] = 0;
12102
12103                         /*
12104                          * We need to set our offset to zero.  And because we
12105                          * are wrapping, we need to set the bit indicating as
12106                          * much.  We can also adjust our needed space back
12107                          * down to the space required by the ECB -- we know
12108                          * that the top of the buffer is aligned.
12109                          */
12110                         offs = 0;
12111                         total_off = needed;
12112                         buf->dtb_flags |= DTRACEBUF_WRAPPED;
12113                 } else {
12114                         /*
12115                          * There is room for us in the buffer, so we simply
12116                          * need to check the wrapped offset.
12117                          */
12118                         if (woffs < offs) {
12119                                 /*
12120                                  * The wrapped offset is less than the offset.
12121                                  * This can happen if we allocated buffer space
12122                                  * that induced a wrap, but then we didn't
12123                                  * subsequently take the space due to an error
12124                                  * or false predicate evaluation.  This is
12125                                  * okay; we know that _this_ allocation isn't
12126                                  * going to induce a wrap.  We still can't
12127                                  * reset the wrapped offset to be zero,
12128                                  * however: the space may have been trashed in
12129                                  * the previous failed probe attempt.  But at
12130                                  * least the wrapped offset doesn't need to
12131                                  * be adjusted at all...
12132                                  */
12133                                 goto out;
12134                         }
12135                 }
12136
12137                 while (offs + total_off > (size_t)woffs) {
12138                         dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12139                         size_t size;
12140
12141                         if (epid == DTRACE_EPIDNONE) {
12142                                 size = sizeof (uint32_t);
12143                         } else {
12144                                 ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
12145                                 ASSERT(state->dts_ecbs[epid - 1] != NULL);
12146
12147                                 size = state->dts_ecbs[epid - 1]->dte_size;
12148                         }
12149
12150                         ASSERT(woffs + size <= buf->dtb_size);
12151                         ASSERT(size != 0);
12152
12153                         if (woffs + size == buf->dtb_size) {
12154                                 /*
12155                                  * We've reached the end of the buffer; we want
12156                                  * to set the wrapped offset to 0 and break
12157                                  * out.  However, if the offs is 0, then we're
12158                                  * in a strange edge-condition:  the amount of
12159                                  * space that we want to reserve plus the size
12160                                  * of the record that we're overwriting is
12161                                  * greater than the size of the buffer.  This
12162                                  * is problematic because if we reserve the
12163                                  * space but subsequently don't consume it (due
12164                                  * to a failed predicate or error) the wrapped
12165                                  * offset will be 0 -- yet the EPID at offset 0
12166                                  * will not be committed.  This situation is
12167                                  * relatively easy to deal with:  if we're in
12168                                  * this case, the buffer is indistinguishable
12169                                  * from one that hasn't wrapped; we need only
12170                                  * finish the job by clearing the wrapped bit,
12171                                  * explicitly setting the offset to be 0, and
12172                                  * zero'ing out the old data in the buffer.
12173                                  */
12174                                 if (offs == 0) {
12175                                         buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12176                                         buf->dtb_offset = 0;
12177                                         woffs = total_off;
12178
12179                                         while ((uint64_t)woffs < buf->dtb_size)
12180                                                 tomax[woffs++] = 0;
12181                                 }
12182
12183                                 woffs = 0;
12184                                 break;
12185                         }
12186
12187                         woffs += size;
12188                 }
12189
12190                 /*
12191                  * We have a wrapped offset.  It may be that the wrapped offset
12192                  * has become zero -- that's okay.
12193                  */
12194                 buf->dtb_xamot_offset = woffs;
12195         }
12196
12197 out:
12198         /*
12199          * Now we can plow the buffer with any necessary padding.
12200          */
12201         while (offs & (align - 1)) {
12202                 /*
12203                  * Assert that our alignment is off by a number which
12204                  * is itself sizeof (uint32_t) aligned.
12205                  */
12206                 ASSERT(!((align - (offs & (align - 1))) &
12207                     (sizeof (uint32_t) - 1)));
12208                 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12209                 offs += sizeof (uint32_t);
12210         }
12211
12212         if (buf->dtb_flags & DTRACEBUF_FILL) {
12213                 if (offs + needed > buf->dtb_size - state->dts_reserve) {
12214                         buf->dtb_flags |= DTRACEBUF_FULL;
12215                         return (-1);
12216                 }
12217         }
12218
12219         if (mstate == NULL)
12220                 return (offs);
12221
12222         /*
12223          * For ring buffers and fill buffers, the scratch space is always
12224          * the inactive buffer.
12225          */
12226         mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12227         mstate->dtms_scratch_size = buf->dtb_size;
12228         mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12229
12230         return (offs);
12231 }
12232
12233 static void
12234 dtrace_buffer_polish(dtrace_buffer_t *buf)
12235 {
12236         ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12237         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12238
12239         if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12240                 return;
12241
12242         /*
12243          * We need to polish the ring buffer.  There are three cases:
12244          *
12245          * - The first (and presumably most common) is that there is no gap
12246          *   between the buffer offset and the wrapped offset.  In this case,
12247          *   there is nothing in the buffer that isn't valid data; we can
12248          *   mark the buffer as polished and return.
12249          *
12250          * - The second (less common than the first but still more common
12251          *   than the third) is that there is a gap between the buffer offset
12252          *   and the wrapped offset, and the wrapped offset is larger than the
12253          *   buffer offset.  This can happen because of an alignment issue, or
12254          *   can happen because of a call to dtrace_buffer_reserve() that
12255          *   didn't subsequently consume the buffer space.  In this case,
12256          *   we need to zero the data from the buffer offset to the wrapped
12257          *   offset.
12258          *
12259          * - The third (and least common) is that there is a gap between the
12260          *   buffer offset and the wrapped offset, but the wrapped offset is
12261          *   _less_ than the buffer offset.  This can only happen because a
12262          *   call to dtrace_buffer_reserve() induced a wrap, but the space
12263          *   was not subsequently consumed.  In this case, we need to zero the
12264          *   space from the offset to the end of the buffer _and_ from the
12265          *   top of the buffer to the wrapped offset.
12266          */
12267         if (buf->dtb_offset < buf->dtb_xamot_offset) {
12268                 bzero(buf->dtb_tomax + buf->dtb_offset,
12269                     buf->dtb_xamot_offset - buf->dtb_offset);
12270         }
12271
12272         if (buf->dtb_offset > buf->dtb_xamot_offset) {
12273                 bzero(buf->dtb_tomax + buf->dtb_offset,
12274                     buf->dtb_size - buf->dtb_offset);
12275                 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12276         }
12277 }
12278
12279 static void
12280 dtrace_buffer_free(dtrace_buffer_t *bufs)
12281 {
12282         int i;
12283
12284         for (i = 0; i < (int)NCPU; i++) {
12285                 dtrace_buffer_t *buf = &bufs[i];
12286
12287                 if (buf->dtb_tomax == NULL) {
12288                         ASSERT(buf->dtb_xamot == NULL);
12289                         ASSERT(buf->dtb_size == 0);
12290                         continue;
12291                 }
12292
12293                 if (buf->dtb_xamot != NULL) {
12294                         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12295                         kmem_free(buf->dtb_xamot, buf->dtb_size);
12296
12297                         ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12298                         dtrace_buffer_memory_inuse -= buf->dtb_size;
12299                 }
12300
12301                 kmem_free(buf->dtb_tomax, buf->dtb_size);
12302                 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12303                 dtrace_buffer_memory_inuse -= buf->dtb_size;
12304
12305                 buf->dtb_size = 0;
12306                 buf->dtb_tomax = NULL;
12307                 buf->dtb_xamot = NULL;
12308         }
12309 }
12310
12311 /*
12312  * DTrace Enabling Functions
12313  */
12314 static dtrace_enabling_t *
12315 dtrace_enabling_create(dtrace_vstate_t *vstate)
12316 {
12317         dtrace_enabling_t *enab;
12318
12319         enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12320         enab->dten_vstate = vstate;
12321
12322         return (enab);
12323 }
12324
12325 static void
12326 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12327 {
12328         dtrace_ecbdesc_t **ndesc;
12329         size_t osize, nsize;
12330
12331         /*
12332          * We can't add to enablings after we've enabled them, or after we've
12333          * retained them.
12334          */
12335         ASSERT(enab->dten_probegen == 0);
12336         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12337
12338         /* APPLE NOTE: this protects against gcc 4.0 botch on x86 */
12339         if (ecb == NULL) return;
12340
12341         if (enab->dten_ndesc < enab->dten_maxdesc) {
12342                 enab->dten_desc[enab->dten_ndesc++] = ecb;
12343                 return;
12344         }
12345
12346         osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12347
12348         if (enab->dten_maxdesc == 0) {
12349                 enab->dten_maxdesc = 1;
12350         } else {
12351                 enab->dten_maxdesc <<= 1;
12352         }
12353
12354         ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12355
12356         nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12357         ndesc = kmem_zalloc(nsize, KM_SLEEP);
12358         bcopy(enab->dten_desc, ndesc, osize);
12359         kmem_free(enab->dten_desc, osize);
12360
12361         enab->dten_desc = ndesc;
12362         enab->dten_desc[enab->dten_ndesc++] = ecb;
12363 }
12364
12365 static void
12366 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12367     dtrace_probedesc_t *pd)
12368 {
12369         dtrace_ecbdesc_t *new;
12370         dtrace_predicate_t *pred;
12371         dtrace_actdesc_t *act;
12372
12373         /*
12374          * We're going to create a new ECB description that matches the
12375          * specified ECB in every way, but has the specified probe description.
12376          */
12377         new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12378
12379         if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12380                 dtrace_predicate_hold(pred);
12381
12382         for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12383                 dtrace_actdesc_hold(act);
12384
12385         new->dted_action = ecb->dted_action;
12386         new->dted_pred = ecb->dted_pred;
12387         new->dted_probe = *pd;
12388         new->dted_uarg = ecb->dted_uarg;
12389
12390         dtrace_enabling_add(enab, new);
12391 }
12392
12393 static void
12394 dtrace_enabling_dump(dtrace_enabling_t *enab)
12395 {
12396         int i;
12397
12398         for (i = 0; i < enab->dten_ndesc; i++) {
12399                 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12400
12401                 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12402                     desc->dtpd_provider, desc->dtpd_mod,
12403                     desc->dtpd_func, desc->dtpd_name);
12404         }
12405 }
12406
12407 static void
12408 dtrace_enabling_destroy(dtrace_enabling_t *enab)
12409 {
12410         int i;
12411         dtrace_ecbdesc_t *ep;
12412         dtrace_vstate_t *vstate = enab->dten_vstate;
12413
12414         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12415
12416         for (i = 0; i < enab->dten_ndesc; i++) {
12417                 dtrace_actdesc_t *act, *next;
12418                 dtrace_predicate_t *pred;
12419
12420                 ep = enab->dten_desc[i];
12421
12422                 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12423                         dtrace_predicate_release(pred, vstate);
12424
12425                 for (act = ep->dted_action; act != NULL; act = next) {
12426                         next = act->dtad_next;
12427                         dtrace_actdesc_release(act, vstate);
12428                 }
12429
12430                 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12431         }
12432
12433         kmem_free(enab->dten_desc,
12434             enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
12435
12436         /*
12437          * If this was a retained enabling, decrement the dts_nretained count
12438          * and take it off of the dtrace_retained list.
12439          */
12440         if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12441             dtrace_retained == enab) {
12442                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12443                 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12444                 enab->dten_vstate->dtvs_state->dts_nretained--;
12445                 dtrace_retained_gen++;
12446         }
12447
12448         if (enab->dten_prev == NULL) {
12449                 if (dtrace_retained == enab) {
12450                         dtrace_retained = enab->dten_next;
12451
12452                         if (dtrace_retained != NULL)
12453                                 dtrace_retained->dten_prev = NULL;
12454                 }
12455         } else {
12456                 ASSERT(enab != dtrace_retained);
12457                 ASSERT(dtrace_retained != NULL);
12458                 enab->dten_prev->dten_next = enab->dten_next;
12459         }
12460
12461         if (enab->dten_next != NULL) {
12462                 ASSERT(dtrace_retained != NULL);
12463                 enab->dten_next->dten_prev = enab->dten_prev;
12464         }
12465
12466         kmem_free(enab, sizeof (dtrace_enabling_t));
12467 }
12468
12469 static int
12470 dtrace_enabling_retain(dtrace_enabling_t *enab)
12471 {
12472         dtrace_state_t *state;
12473
12474         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12475         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12476         ASSERT(enab->dten_vstate != NULL);
12477
12478         state = enab->dten_vstate->dtvs_state;
12479         ASSERT(state != NULL);
12480
12481         /*
12482          * We only allow each state to retain dtrace_retain_max enablings.
12483          */
12484         if (state->dts_nretained >= dtrace_retain_max)
12485                 return (ENOSPC);
12486
12487         state->dts_nretained++;
12488         dtrace_retained_gen++;
12489
12490         if (dtrace_retained == NULL) {
12491                 dtrace_retained = enab;
12492                 return (0);
12493         }
12494
12495         enab->dten_next = dtrace_retained;
12496         dtrace_retained->dten_prev = enab;
12497         dtrace_retained = enab;
12498
12499         return (0);
12500 }
12501
12502 static int
12503 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12504     dtrace_probedesc_t *create)
12505 {
12506         dtrace_enabling_t *new, *enab;
12507         int found = 0, err = ENOENT;
12508
12509         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12510         ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12511         ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12512         ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12513         ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12514
12515         new = dtrace_enabling_create(&state->dts_vstate);
12516
12517         /*
12518          * Iterate over all retained enablings, looking for enablings that
12519          * match the specified state.
12520          */
12521         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12522                 int i;
12523
12524                 /*
12525                  * dtvs_state can only be NULL for helper enablings -- and
12526                  * helper enablings can't be retained.
12527                  */
12528                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12529
12530                 if (enab->dten_vstate->dtvs_state != state)
12531                         continue;
12532
12533                 /*
12534                  * Now iterate over each probe description; we're looking for
12535                  * an exact match to the specified probe description.
12536                  */
12537                 for (i = 0; i < enab->dten_ndesc; i++) {
12538                         dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12539                         dtrace_probedesc_t *pd = &ep->dted_probe;
12540
12541                         /* APPLE NOTE: Darwin employs size bounded string operation. */
12542                         if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
12543                                 continue;
12544
12545                         if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
12546                                 continue;
12547
12548                         if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
12549                                 continue;
12550
12551                         if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
12552                                 continue;
12553
12554                         /*
12555                          * We have a winning probe!  Add it to our growing
12556                          * enabling.
12557                          */
12558                         found = 1;
12559                         dtrace_enabling_addlike(new, ep, create);
12560                 }
12561         }
12562
12563         if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12564                 dtrace_enabling_destroy(new);
12565                 return (err);
12566         }
12567
12568         return (0);
12569 }
12570
12571 static void
12572 dtrace_enabling_retract(dtrace_state_t *state)
12573 {
12574         dtrace_enabling_t *enab, *next;
12575
12576         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12577
12578         /*
12579          * Iterate over all retained enablings, destroy the enablings retained
12580          * for the specified state.
12581          */
12582         for (enab = dtrace_retained; enab != NULL; enab = next) {
12583                 next = enab->dten_next;
12584
12585                 /*
12586                  * dtvs_state can only be NULL for helper enablings -- and
12587                  * helper enablings can't be retained.
12588                  */
12589                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12590
12591                 if (enab->dten_vstate->dtvs_state == state) {
12592                         ASSERT(state->dts_nretained > 0);
12593                         dtrace_enabling_destroy(enab);
12594                 }
12595         }
12596
12597         ASSERT(state->dts_nretained == 0);
12598 }
12599
12600 static int
12601 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched, dtrace_match_cond_t *cond)
12602 {
12603         int i = 0;
12604         int total_matched = 0, matched = 0;
12605
12606         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12607         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12608
12609         for (i = 0; i < enab->dten_ndesc; i++) {
12610                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12611
12612                 enab->dten_current = ep;
12613                 enab->dten_error = 0;
12614
12615                 /**
12616                  * Before doing a dtrace_probe_enable, which is really
12617                  * expensive, check that this enabling matches the matching precondition
12618                  * if we have one
12619                  */
12620                 if (cond && (cond->dmc_func(&ep->dted_probe, cond->dmc_data) == 0)) {
12621                         continue;
12622                 }
12623                 /*
12624                  * If a provider failed to enable a probe then get out and
12625                  * let the consumer know we failed.
12626                  */
12627                 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab, ep)) < 0)
12628                         return (EBUSY);
12629
12630                 total_matched += matched;
12631
12632                 if (enab->dten_error != 0) {
12633                         /*
12634                          * If we get an error half-way through enabling the
12635                          * probes, we kick out -- perhaps with some number of
12636                          * them enabled.  Leaving enabled probes enabled may
12637                          * be slightly confusing for user-level, but we expect
12638                          * that no one will attempt to actually drive on in
12639                          * the face of such errors.  If this is an anonymous
12640                          * enabling (indicated with a NULL nmatched pointer),
12641                          * we cmn_err() a message.  We aren't expecting to
12642                          * get such an error -- such as it can exist at all,
12643                          * it would be a result of corrupted DOF in the driver
12644                          * properties.
12645                          */
12646                         if (nmatched == NULL) {
12647                                 cmn_err(CE_WARN, "dtrace_enabling_match() "
12648                                     "error on %p: %d", (void *)ep,
12649                                     enab->dten_error);
12650                         }
12651
12652                         return (enab->dten_error);
12653                 }
12654
12655                 ep->dted_probegen = dtrace_probegen;
12656         }
12657
12658         if (nmatched != NULL)
12659                 *nmatched = total_matched;
12660
12661         return (0);
12662 }
12663
12664 static void
12665 dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond)
12666 {
12667         dtrace_enabling_t *enab;
12668
12669         lck_mtx_lock(&cpu_lock);
12670         lck_mtx_lock(&dtrace_lock);
12671
12672         /*
12673          * Iterate over all retained enablings to see if any probes match
12674          * against them.  We only perform this operation on enablings for which
12675          * we have sufficient permissions by virtue of being in the global zone
12676          * or in the same zone as the DTrace client.  Because we can be called
12677          * after dtrace_detach() has been called, we cannot assert that there
12678          * are retained enablings.  We can safely load from dtrace_retained,
12679          * however:  the taskq_destroy() at the end of dtrace_detach() will
12680          * block pending our completion.
12681          */
12682
12683         /*
12684          * Darwin doesn't do zones.
12685          * Behave as if always in "global" zone."
12686          */
12687         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12688                 (void) dtrace_enabling_match(enab, NULL, cond);
12689         }
12690
12691         lck_mtx_unlock(&dtrace_lock);
12692         lck_mtx_unlock(&cpu_lock);
12693
12694 }
12695
12696 static void
12697 dtrace_enabling_matchall(void)
12698 {
12699         dtrace_enabling_matchall_with_cond(NULL);
12700 }
12701
12702
12703
12704 /*
12705  * If an enabling is to be enabled without having matched probes (that is, if
12706  * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12707  * enabling must be _primed_ by creating an ECB for every ECB description.
12708  * This must be done to assure that we know the number of speculations, the
12709  * number of aggregations, the minimum buffer size needed, etc. before we
12710  * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
12711  * enabling any probes, we create ECBs for every ECB description, but with a
12712  * NULL probe -- which is exactly what this function does.
12713  */
12714 static void
12715 dtrace_enabling_prime(dtrace_state_t *state)
12716 {
12717         dtrace_enabling_t *enab;
12718         int i;
12719
12720         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12721                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12722
12723                 if (enab->dten_vstate->dtvs_state != state)
12724                         continue;
12725
12726                 /*
12727                  * We don't want to prime an enabling more than once, lest
12728                  * we allow a malicious user to induce resource exhaustion.
12729                  * (The ECBs that result from priming an enabling aren't
12730                  * leaked -- but they also aren't deallocated until the
12731                  * consumer state is destroyed.)
12732                  */
12733                 if (enab->dten_primed)
12734                         continue;
12735
12736                 for (i = 0; i < enab->dten_ndesc; i++) {
12737                         enab->dten_current = enab->dten_desc[i];
12738                         (void) dtrace_probe_enable(NULL, enab, NULL);
12739                 }
12740
12741                 enab->dten_primed = 1;
12742         }
12743 }
12744
12745 /*
12746  * Called to indicate that probes should be provided due to retained
12747  * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
12748  * must take an initial lap through the enabling calling the dtps_provide()
12749  * entry point explicitly to allow for autocreated probes.
12750  */
12751 static void
12752 dtrace_enabling_provide(dtrace_provider_t *prv)
12753 {
12754         int i, all = 0;
12755         dtrace_probedesc_t desc;
12756         dtrace_genid_t gen;
12757
12758         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12759         LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
12760
12761         if (prv == NULL) {
12762                 all = 1;
12763                 prv = dtrace_provider;
12764         }
12765
12766         do {
12767                 dtrace_enabling_t *enab;
12768                 void *parg = prv->dtpv_arg;
12769
12770 retry:
12771                 gen = dtrace_retained_gen;
12772                 for (enab = dtrace_retained; enab != NULL;
12773                     enab = enab->dten_next) {
12774                         for (i = 0; i < enab->dten_ndesc; i++) {
12775                                 desc = enab->dten_desc[i]->dted_probe;
12776                                 lck_mtx_unlock(&dtrace_lock);
12777                                 prv->dtpv_pops.dtps_provide(parg, &desc);
12778                                 lck_mtx_lock(&dtrace_lock);
12779                                 /*
12780                                  * Process the retained enablings again if
12781                                  * they have changed while we weren't holding
12782                                  * dtrace_lock.
12783                                  */
12784                                 if (gen != dtrace_retained_gen)
12785                                         goto retry;
12786                         }
12787                 }
12788         } while (all && (prv = prv->dtpv_next) != NULL);
12789
12790         lck_mtx_unlock(&dtrace_lock);
12791         dtrace_probe_provide(NULL, all ? NULL : prv);
12792         lck_mtx_lock(&dtrace_lock);
12793 }
12794
12795 /*
12796  * DTrace DOF Functions
12797  */
12798 /*ARGSUSED*/
12799 static void
12800 dtrace_dof_error(dof_hdr_t *dof, const char *str)
12801 {
12802 #pragma unused(dof) /* __APPLE__ */
12803         if (dtrace_err_verbose)
12804                 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12805
12806 #ifdef DTRACE_ERRDEBUG
12807         dtrace_errdebug(str);
12808 #endif
12809 }
12810
12811 /*
12812  * Create DOF out of a currently enabled state.  Right now, we only create
12813  * DOF containing the run-time options -- but this could be expanded to create
12814  * complete DOF representing the enabled state.
12815  */
12816 static dof_hdr_t *
12817 dtrace_dof_create(dtrace_state_t *state)
12818 {
12819         dof_hdr_t *dof;
12820         dof_sec_t *sec;
12821         dof_optdesc_t *opt;
12822         int i, len = sizeof (dof_hdr_t) +
12823             roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12824             sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12825
12826         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12827
12828         dof = kmem_zalloc_aligned(len, 8, KM_SLEEP);
12829         dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12830         dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12831         dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12832         dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12833
12834         dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12835         dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12836         dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12837         dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12838         dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12839         dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12840
12841         dof->dofh_flags = 0;
12842         dof->dofh_hdrsize = sizeof (dof_hdr_t);
12843         dof->dofh_secsize = sizeof (dof_sec_t);
12844         dof->dofh_secnum = 1;   /* only DOF_SECT_OPTDESC */
12845         dof->dofh_secoff = sizeof (dof_hdr_t);
12846         dof->dofh_loadsz = len;
12847         dof->dofh_filesz = len;
12848         dof->dofh_pad = 0;
12849
12850         /*
12851          * Fill in the option section header...
12852          */
12853         sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12854         sec->dofs_type = DOF_SECT_OPTDESC;
12855         sec->dofs_align = sizeof (uint64_t);
12856         sec->dofs_flags = DOF_SECF_LOAD;
12857         sec->dofs_entsize = sizeof (dof_optdesc_t);
12858
12859         opt = (dof_optdesc_t *)((uintptr_t)sec +
12860             roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12861
12862         sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12863         sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12864
12865         for (i = 0; i < DTRACEOPT_MAX; i++) {
12866                 opt[i].dofo_option = i;
12867                 opt[i].dofo_strtab = DOF_SECIDX_NONE;
12868                 opt[i].dofo_value = state->dts_options[i];
12869         }
12870
12871         return (dof);
12872 }
12873
12874 static dof_hdr_t *
12875 dtrace_dof_copyin(user_addr_t uarg, int *errp)
12876 {
12877         dof_hdr_t hdr, *dof;
12878
12879         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12880
12881         /*
12882          * First, we're going to copyin() the sizeof (dof_hdr_t).
12883          */
12884         if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
12885                 dtrace_dof_error(NULL, "failed to copyin DOF header");
12886                 *errp = EFAULT;
12887                 return (NULL);
12888         }
12889
12890         /*
12891          * Now we'll allocate the entire DOF and copy it in -- provided
12892          * that the length isn't outrageous.
12893          */
12894         if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12895                 dtrace_dof_error(&hdr, "load size exceeds maximum");
12896                 *errp = E2BIG;
12897                 return (NULL);
12898         }
12899
12900         if (hdr.dofh_loadsz < sizeof (hdr)) {
12901                 dtrace_dof_error(&hdr, "invalid load size");
12902                 *errp = EINVAL;
12903                 return (NULL);
12904         }
12905
12906         dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12907
12908         if (copyin(uarg, dof, hdr.dofh_loadsz) != 0  ||
12909           dof->dofh_loadsz != hdr.dofh_loadsz) {
12910             kmem_free_aligned(dof, hdr.dofh_loadsz);
12911             *errp = EFAULT;
12912             return (NULL);
12913         }
12914
12915         return (dof);
12916 }
12917
12918 static dof_hdr_t *
12919 dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
12920 {
12921         dof_hdr_t hdr, *dof;
12922
12923         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12924
12925         /*
12926          * First, we're going to copyin() the sizeof (dof_hdr_t).
12927          */
12928         if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
12929                 dtrace_dof_error(NULL, "failed to copyin DOF header");
12930                 *errp = EFAULT;
12931                 return (NULL);
12932         }
12933
12934         /*
12935          * Now we'll allocate the entire DOF and copy it in -- provided
12936          * that the length isn't outrageous.
12937          */
12938         if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12939                 dtrace_dof_error(&hdr, "load size exceeds maximum");
12940                 *errp = E2BIG;
12941                 return (NULL);
12942         }
12943
12944         if (hdr.dofh_loadsz < sizeof (hdr)) {
12945                 dtrace_dof_error(&hdr, "invalid load size");
12946                 *errp = EINVAL;
12947                 return (NULL);
12948         }
12949
12950         dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12951
12952         if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
12953                 kmem_free_aligned(dof, hdr.dofh_loadsz);
12954                 *errp = EFAULT;
12955                 return (NULL);
12956         }
12957
12958         return (dof);
12959 }
12960
12961 static void
12962 dtrace_dof_destroy(dof_hdr_t *dof)
12963 {
12964         kmem_free_aligned(dof, dof->dofh_loadsz);
12965 }
12966
12967 static dof_hdr_t *
12968 dtrace_dof_property(const char *name)
12969 {
12970         unsigned int len = 0;
12971         dof_hdr_t *dof;
12972
12973         if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
12974                 return NULL;
12975         }
12976
12977         if (!PEReadNVRAMProperty(name, NULL, &len)) {
12978                 return NULL;
12979         }
12980
12981         dof = kmem_alloc_aligned(len, 8, KM_SLEEP);
12982
12983         if (!PEReadNVRAMProperty(name, dof, &len)) {
12984                 dtrace_dof_destroy(dof);
12985                 dtrace_dof_error(NULL, "unreadable DOF");
12986                 return NULL;
12987         }
12988
12989         if (len < sizeof (dof_hdr_t)) {
12990                 dtrace_dof_destroy(dof);
12991                 dtrace_dof_error(NULL, "truncated header");
12992                 return (NULL);
12993         }
12994
12995         if (len < dof->dofh_loadsz) {
12996                 dtrace_dof_destroy(dof);
12997                 dtrace_dof_error(NULL, "truncated DOF");
12998                 return (NULL);
12999         }
13000
13001         if (len != dof->dofh_loadsz) {
13002                 dtrace_dof_destroy(dof);
13003                 dtrace_dof_error(NULL, "invalid DOF size");
13004                 return (NULL);
13005         }
13006
13007         if (dof->dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13008                 dtrace_dof_destroy(dof);
13009                 dtrace_dof_error(NULL, "oversized DOF");
13010                 return (NULL);
13011         }
13012
13013         return (dof);
13014 }
13015
13016 /*
13017  * Return the dof_sec_t pointer corresponding to a given section index.  If the
13018  * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
13019  * a type other than DOF_SECT_NONE is specified, the header is checked against
13020  * this type and NULL is returned if the types do not match.
13021  */
13022 static dof_sec_t *
13023 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13024 {
13025         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13026             ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13027
13028         if (i >= dof->dofh_secnum) {
13029                 dtrace_dof_error(dof, "referenced section index is invalid");
13030                 return (NULL);
13031         }
13032
13033         if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13034                 dtrace_dof_error(dof, "referenced section is not loadable");
13035                 return (NULL);
13036         }
13037
13038         if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13039                 dtrace_dof_error(dof, "referenced section is the wrong type");
13040                 return (NULL);
13041         }
13042
13043         return (sec);
13044 }
13045
13046 static dtrace_probedesc_t *
13047 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13048 {
13049         dof_probedesc_t *probe;
13050         dof_sec_t *strtab;
13051         uintptr_t daddr = (uintptr_t)dof;
13052         uintptr_t str;
13053         size_t size;
13054
13055         if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13056                 dtrace_dof_error(dof, "invalid probe section");
13057                 return (NULL);
13058         }
13059
13060         if (sec->dofs_align != sizeof (dof_secidx_t)) {
13061                 dtrace_dof_error(dof, "bad alignment in probe description");
13062                 return (NULL);
13063         }
13064
13065         if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13066                 dtrace_dof_error(dof, "truncated probe description");
13067                 return (NULL);
13068         }
13069
13070         probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13071         strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13072
13073         if (strtab == NULL)
13074                 return (NULL);
13075
13076         str = daddr + strtab->dofs_offset;
13077         size = strtab->dofs_size;
13078
13079         if (probe->dofp_provider >= strtab->dofs_size) {
13080                 dtrace_dof_error(dof, "corrupt probe provider");
13081                 return (NULL);
13082         }
13083
13084         (void) strncpy(desc->dtpd_provider,
13085             (char *)(str + probe->dofp_provider),
13086             MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13087
13088         /* APPLE NOTE: Darwin employs size bounded string operation. */
13089         desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
13090
13091         if (probe->dofp_mod >= strtab->dofs_size) {
13092                 dtrace_dof_error(dof, "corrupt probe module");
13093                 return (NULL);
13094         }
13095
13096         (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13097             MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13098
13099         /* APPLE NOTE: Darwin employs size bounded string operation. */
13100         desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
13101
13102         if (probe->dofp_func >= strtab->dofs_size) {
13103                 dtrace_dof_error(dof, "corrupt probe function");
13104                 return (NULL);
13105         }
13106
13107         (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13108             MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13109
13110         /* APPLE NOTE: Darwin employs size bounded string operation. */
13111         desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
13112
13113         if (probe->dofp_name >= strtab->dofs_size) {
13114                 dtrace_dof_error(dof, "corrupt probe name");
13115                 return (NULL);
13116         }
13117
13118         (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13119             MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13120
13121         /* APPLE NOTE: Darwin employs size bounded string operation. */
13122         desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
13123
13124         return (desc);
13125 }
13126
/*
 * Construct a DIF object (dtrace_difo_t) from the DOF_SECT_DIFOHDR section
 * 'sec' of 'dof'.
 *
 * The DIFO header lists section indices ("links").  Each linked section of a
 * recognized type (DIF text, integer table, string table, variable table) is
 * copied into a freshly allocated buffer hung off the new DIF object.  The
 * object is then checked with dtrace_difo_validate() and, if that succeeds,
 * passed to dtrace_difo_init() and returned.
 *
 * Returns the new DIF object, or NULL on any error (after releasing whatever
 * had been allocated).  Most failures are reported through
 * dtrace_dof_error().
 */
static dtrace_difo_t *
dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
	dtrace_difo_t *dp;
	size_t ttl = 0;		/* running total of linked section sizes */
	dof_difohdr_t *dofd;
	uintptr_t daddr = (uintptr_t)dof;
	size_t max_size = dtrace_difo_maxsize;
	uint_t i;
	int l, n;


	/*
	 * Table of the sub-section types we know how to load.  For each:
	 * the DOF section type, the offsets within dtrace_difo_t of the
	 * buffer pointer and of the length field to fill in, the required
	 * entry size and alignment, and the message to report if a second
	 * section of the same type is linked.  The table is terminated by a
	 * DOF_SECT_NONE entry.
	 */
	static const struct {
		int section;
		int bufoffs;
		int lenoffs;
		int entsize;
		int align;
		const char *msg;
	} difo[] = {
		{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
		offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
		sizeof (dif_instr_t), "multiple DIF sections" },

		{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
		offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
		sizeof (uint64_t), "multiple integer tables" },

		{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
		offsetof(dtrace_difo_t, dtdo_strlen), 0,
		sizeof (char), "multiple string tables" },

		{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
		offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
		sizeof (uint_t), "multiple variable tables" },

		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
	};

	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
		dtrace_dof_error(dof, "invalid DIFO header section");
		return (NULL);
	}

	if (sec->dofs_align != sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "bad alignment in DIFO header");
		return (NULL);
	}

	if (sec->dofs_size < sizeof (dof_difohdr_t) ||
	    sec->dofs_size % sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "bad size in DIFO header");
		return (NULL);
	}

	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
	/*
	 * Number of links: one, plus one for every dof_secidx_t by which the
	 * section exceeds sizeof (dof_difohdr_t).
	 * NOTE(review): presumably dof_difohdr_t itself already contains the
	 * first link slot, hence the "+ 1" -- confirm against <sys/dtrace.h>.
	 */
	n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;

	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
	dp->dtdo_rtype = dofd->dofd_rtype;

	for (l = 0; l < n; l++) {
		dof_sec_t *subsec;
		void **bufp;
		uint32_t *lenp;

		if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
		    dofd->dofd_links[l])) == NULL)
			goto err; /* invalid section link */

		/*
		 * Bound the combined size of all linked sections by
		 * dtrace_difo_maxsize.
		 */
		if (ttl + subsec->dofs_size > max_size) {
			dtrace_dof_error(dof, "exceeds maximum size");
			goto err;
		}

		ttl += subsec->dofs_size;

		/*
		 * Find the table entry for this sub-section's type; on a
		 * match, validate the sub-section and copy it in.  If no
		 * entry matches, the loop ends with 'i' at the terminator.
		 */
		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {

			if (subsec->dofs_type != (uint32_t)difo[i].section)
				continue;

			if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
				dtrace_dof_error(dof, "section not loaded");
				goto err;
			}

			if (subsec->dofs_align != (uint32_t)difo[i].align) {
				dtrace_dof_error(dof, "bad alignment");
				goto err;
			}

			/*
			 * Locate the buffer pointer and length field inside
			 * the DIF object that correspond to this type.
			 */
			bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
			lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);

			/* A non-NULL buffer means this type was seen before. */
			if (*bufp != NULL) {
				dtrace_dof_error(dof, difo[i].msg);
				goto err;
			}

			if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
				dtrace_dof_error(dof, "entry size mismatch");
				goto err;
			}

			if (subsec->dofs_entsize != 0 &&
			    (subsec->dofs_size % subsec->dofs_entsize) != 0) {
				dtrace_dof_error(dof, "corrupt entry size");
				goto err;
			}

			/*
			 * Copy the section contents.  The length is recorded
			 * in bytes first and then, for sections with a
			 * non-zero entry size, converted to an entry count.
			 */
			*lenp = subsec->dofs_size;
			*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
			bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
			    *bufp, subsec->dofs_size);

			if (subsec->dofs_entsize != 0)
				*lenp /= subsec->dofs_entsize;

			break;
		}

		/*
		 * If we encounter a loadable DIFO sub-section that is not
		 * known to us, assume this is a broken program and fail.
		 */
		if (difo[i].section == DOF_SECT_NONE &&
		    (subsec->dofs_flags & DOF_SECF_LOAD)) {
			dtrace_dof_error(dof, "unrecognized DIFO subsection");
			goto err;
		}
	}

	if (dp->dtdo_buf == NULL) {
		/*
		 * We can't have a DIF object without DIF text.
		 */
		dtrace_dof_error(dof, "missing DIF text");
		goto err;
	}

	/*
	 * Before we validate the DIF object, run through the variable table
	 * looking for string variables (with ids at or above
	 * DIF_VAR_OTHER_UBASE) -- if any of them has a size of zero, we'll set
	 * its size to be the system-wide default string size.  Note that
	 * this should _not_ happen if the "strsize" option has been set --
	 * in this case, the compiler should have set the size to reflect the
	 * setting of the option.
	 */
	for (i = 0; i < dp->dtdo_varlen; i++) {
		dtrace_difv_t *v = &dp->dtdo_vartab[i];
		dtrace_diftype_t *t = &v->dtdv_type;

		if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
			continue;

		if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
			t->dtdt_size = dtrace_strsize_default;
	}

	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
		goto err;

	dtrace_difo_init(dp, vstate);
	return (dp);

err:
	/*
	 * Release every buffer (any that were never allocated are still NULL
	 * with a zero length, since dp came from kmem_zalloc()), then the DIF
	 * object itself.  The sizes mirror how the lengths were stored above:
	 * entry counts for the DIF text, integer and variable tables, bytes
	 * for the string table.
	 */
	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));

	kmem_free(dp, sizeof (dtrace_difo_t));
	return (NULL);
}
13303
13304 static dtrace_predicate_t *
13305 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13306     cred_t *cr)
13307 {
13308         dtrace_difo_t *dp;
13309
13310         if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13311                 return (NULL);
13312
13313         return (dtrace_predicate_create(dp));
13314 }
13315
13316 static dtrace_actdesc_t *
13317 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13318     cred_t *cr)
13319 {
13320         dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13321         dof_actdesc_t *desc;
13322         dof_sec_t *difosec;
13323         size_t offs;
13324         uintptr_t daddr = (uintptr_t)dof;
13325         uint64_t arg;
13326         dtrace_actkind_t kind;
13327
13328         if (sec->dofs_type != DOF_SECT_ACTDESC) {
13329                 dtrace_dof_error(dof, "invalid action section");
13330                 return (NULL);
13331         }
13332
13333         if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13334                 dtrace_dof_error(dof, "truncated action description");
13335                 return (NULL);
13336         }
13337
13338         if (sec->dofs_align != sizeof (uint64_t)) {
13339                 dtrace_dof_error(dof, "bad alignment in action description");
13340                 return (NULL);
13341         }
13342
13343         if (sec->dofs_size < sec->dofs_entsize) {
13344                 dtrace_dof_error(dof, "section entry size exceeds total size");
13345                 return (NULL);
13346         }
13347
13348         if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13349                 dtrace_dof_error(dof, "bad entry size in action description");
13350                 return (NULL);
13351         }
13352
13353         if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13354                 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13355                 return (NULL);
13356         }
13357
13358         for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13359                 desc = (dof_actdesc_t *)(daddr +
13360                     (uintptr_t)sec->dofs_offset + offs);
13361                 kind = (dtrace_actkind_t)desc->dofa_kind;
13362
13363                 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13364                     (kind != DTRACEACT_PRINTA || desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13365                     (kind == DTRACEACT_DIFEXPR && desc->dofa_strtab != DOF_SECIDX_NONE))
13366                 {
13367                         dof_sec_t *strtab;
13368                         char *str, *fmt;
13369                         uint64_t i;
13370
13371                         /*
13372                          * The argument to these actions is an index into the
13373                          * DOF string table.  For printf()-like actions, this
13374                          * is the format string.  For print(), this is the
13375                          * CTF type of the expression result.
13376                          */
13377                         if ((strtab = dtrace_dof_sect(dof,
13378                             DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13379                                 goto err;
13380
13381                         str = (char *)((uintptr_t)dof +
13382                             (uintptr_t)strtab->dofs_offset);
13383
13384                         for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13385                                 if (str[i] == '\0')
13386                                         break;
13387                         }
13388
13389                         if (i >= strtab->dofs_size) {
13390                                 dtrace_dof_error(dof, "bogus format string");
13391                                 goto err;
13392                         }
13393
13394                         if (i == desc->dofa_arg) {
13395                                 dtrace_dof_error(dof, "empty format string");
13396                                 goto err;
13397                         }
13398
13399                         i -= desc->dofa_arg;
13400                         fmt = kmem_alloc(i + 1, KM_SLEEP);
13401                         bcopy(&str[desc->dofa_arg], fmt, i + 1);
13402                         arg = (uint64_t)(uintptr_t)fmt;
13403                 } else {
13404                         if (kind == DTRACEACT_PRINTA) {
13405                                 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13406                                 arg = 0;
13407                         } else {
13408                                 arg = desc->dofa_arg;
13409                         }
13410                 }
13411
13412                 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13413                     desc->dofa_uarg, arg);
13414
13415                 if (last != NULL) {
13416                         last->dtad_next = act;
13417                 } else {
13418                         first = act;
13419                 }
13420
13421                 last = act;
13422
13423                 if (desc->dofa_difo == DOF_SECIDX_NONE)
13424                         continue;
13425
13426                 if ((difosec = dtrace_dof_sect(dof,
13427                     DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13428                         goto err;
13429
13430                 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13431
13432                 if (act->dtad_difo == NULL)
13433                         goto err;
13434         }
13435
13436         ASSERT(first != NULL);
13437         return (first);
13438
13439 err:
13440         for (act = first; act != NULL; act = next) {
13441                 next = act->dtad_next;
13442                 dtrace_actdesc_release(act, vstate);
13443         }
13444
13445         return (NULL);
13446 }
13447
13448 static dtrace_ecbdesc_t *
13449 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13450     cred_t *cr)
13451 {
13452         dtrace_ecbdesc_t *ep;
13453         dof_ecbdesc_t *ecb;
13454         dtrace_probedesc_t *desc;
13455         dtrace_predicate_t *pred = NULL;
13456
13457         if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13458                 dtrace_dof_error(dof, "truncated ECB description");
13459                 return (NULL);
13460         }
13461
13462         if (sec->dofs_align != sizeof (uint64_t)) {
13463                 dtrace_dof_error(dof, "bad alignment in ECB description");
13464                 return (NULL);
13465         }
13466
13467         ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13468         sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13469
13470         if (sec == NULL)
13471                 return (NULL);
13472
13473         ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13474         ep->dted_uarg = ecb->dofe_uarg;
13475         desc = &ep->dted_probe;
13476
13477         if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13478                 goto err;
13479
13480         if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13481                 if ((sec = dtrace_dof_sect(dof,
13482                     DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13483                         goto err;
13484
13485                 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13486                         goto err;
13487
13488                 ep->dted_pred.dtpdd_predicate = pred;
13489         }
13490
13491         if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13492                 if ((sec = dtrace_dof_sect(dof,
13493                     DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13494                         goto err;
13495
13496                 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13497
13498                 if (ep->dted_action == NULL)
13499                         goto err;
13500         }
13501
13502         return (ep);
13503
13504 err:
13505         if (pred != NULL)
13506                 dtrace_predicate_release(pred, vstate);
13507         kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13508         return (NULL);
13509 }
13510
13511 /*
13512  * APPLE NOTE: dyld handles dof relocation.
13513  * Darwin does not need dtrace_dof_relocate()
13514  */
13515
13516 /*
13517  * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13518  * header:  it should be at the front of a memory region that is at least
13519  * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13520  * size.  It need not be validated in any other way.
13521  */
13522 static int
13523 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13524     dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13525 {
13526 #pragma unused(ubase) /* __APPLE__ */
13527         uint64_t len = dof->dofh_loadsz, seclen;
13528         uintptr_t daddr = (uintptr_t)dof;
13529         dtrace_ecbdesc_t *ep;
13530         dtrace_enabling_t *enab;
13531         uint_t i;
13532
13533         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13534         ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13535
13536         /*
13537          * Check the DOF header identification bytes.  In addition to checking
13538          * valid settings, we also verify that unused bits/bytes are zeroed so
13539          * we can use them later without fear of regressing existing binaries.
13540          */
13541         if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13542             DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13543                 dtrace_dof_error(dof, "DOF magic string mismatch");
13544                 return (-1);
13545         }
13546
13547         if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13548             dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13549                 dtrace_dof_error(dof, "DOF has invalid data model");
13550                 return (-1);
13551         }
13552
13553         if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13554                 dtrace_dof_error(dof, "DOF encoding mismatch");
13555                 return (-1);
13556         }
13557
13558         /*
13559          * APPLE NOTE: Darwin only supports DOF_VERSION_3 for now.
13560          */
13561         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
13562                 dtrace_dof_error(dof, "DOF version mismatch");
13563                 return (-1);
13564         }
13565
13566         if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13567                 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13568                 return (-1);
13569         }
13570
13571         if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13572                 dtrace_dof_error(dof, "DOF uses too many integer registers");
13573                 return (-1);
13574         }
13575
13576         if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13577                 dtrace_dof_error(dof, "DOF uses too many tuple registers");
13578                 return (-1);
13579         }
13580
13581         for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13582                 if (dof->dofh_ident[i] != 0) {
13583                         dtrace_dof_error(dof, "DOF has invalid ident byte set");
13584                         return (-1);
13585                 }
13586         }
13587
13588         if (dof->dofh_flags & ~DOF_FL_VALID) {
13589                 dtrace_dof_error(dof, "DOF has invalid flag bits set");
13590                 return (-1);
13591         }
13592
13593         if (dof->dofh_secsize < sizeof(dof_sec_t)) {
13594                 dtrace_dof_error(dof, "invalid section header size");
13595                 return (-1);
13596         }
13597
13598         /*
13599          * Check that the section headers don't exceed the amount of DOF
13600          * data.  Note that we cast the section size and number of sections
13601          * to uint64_t's to prevent possible overflow in the multiplication.
13602          */
13603         seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13604
13605         if (dof->dofh_secoff > len || seclen > len ||
13606             dof->dofh_secoff + seclen > len) {
13607                 dtrace_dof_error(dof, "truncated section headers");
13608                 return (-1);
13609         }
13610
13611         if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13612                 dtrace_dof_error(dof, "misaligned section headers");
13613                 return (-1);
13614         }
13615
13616         if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13617                 dtrace_dof_error(dof, "misaligned section size");
13618                 return (-1);
13619         }
13620
13621         /*
13622          * Take an initial pass through the section headers to be sure that
13623          * the headers don't have stray offsets.  If the 'noprobes' flag is
13624          * set, do not permit sections relating to providers, probes, or args.
13625          */
13626         for (i = 0; i < dof->dofh_secnum; i++) {
13627                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13628                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13629
13630                 if (noprobes) {
13631                         switch (sec->dofs_type) {
13632                         case DOF_SECT_PROVIDER:
13633                         case DOF_SECT_PROBES:
13634                         case DOF_SECT_PRARGS:
13635                         case DOF_SECT_PROFFS:
13636                                 dtrace_dof_error(dof, "illegal sections "
13637                                     "for enabling");
13638                                 return (-1);
13639                         }
13640                 }
13641
13642                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13643                         continue; /* just ignore non-loadable sections */
13644
13645                 if (sec->dofs_align & (sec->dofs_align - 1)) {
13646                         dtrace_dof_error(dof, "bad section alignment");
13647                         return (-1);
13648                 }
13649
13650                 if (sec->dofs_offset & (sec->dofs_align - 1)) {
13651                         dtrace_dof_error(dof, "misaligned section");
13652                         return (-1);
13653                 }
13654
13655                 if (sec->dofs_offset > len || sec->dofs_size > len ||
13656                     sec->dofs_offset + sec->dofs_size > len) {
13657                         dtrace_dof_error(dof, "corrupt section header");
13658                         return (-1);
13659                 }
13660
13661                 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13662                     sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13663                         dtrace_dof_error(dof, "non-terminating string table");
13664                         return (-1);
13665                 }
13666         }
13667
13668         /*
13669          * APPLE NOTE: We have no further relocation to perform.
13670          * All dof values are relative offsets.
13671          */
13672
13673         if ((enab = *enabp) == NULL)
13674                 enab = *enabp = dtrace_enabling_create(vstate);
13675
13676         for (i = 0; i < dof->dofh_secnum; i++) {
13677                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13678                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13679
13680                 if (sec->dofs_type != DOF_SECT_ECBDESC)
13681                         continue;
13682
13683                 /*
13684                  * APPLE NOTE: Defend against gcc 4.0 botch on x86.
13685                  * not all paths out of inlined dtrace_dof_ecbdesc
13686                  * are checked for the NULL return value.
13687                  * Check for NULL explicitly here.
13688                 */
13689                 ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
13690                 if (ep == NULL) {
13691                         dtrace_enabling_destroy(enab);
13692                         *enabp = NULL;
13693                         return (-1);
13694                 }
13695
13696                 dtrace_enabling_add(enab, ep);
13697         }
13698
13699         return (0);
13700 }
13701
13702 /*
13703  * Process DOF for any options.  This routine assumes that the DOF has been
13704  * at least processed by dtrace_dof_slurp().
13705  */
13706 static int
13707 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13708 {
13709         uint_t i;
13710         int rval;
13711         uint32_t entsize;
13712         size_t offs;
13713         dof_optdesc_t *desc;
13714
13715         for (i = 0; i < dof->dofh_secnum; i++) {
13716                 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13717                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13718
13719                 if (sec->dofs_type != DOF_SECT_OPTDESC)
13720                         continue;
13721
13722                 if (sec->dofs_align != sizeof (uint64_t)) {
13723                         dtrace_dof_error(dof, "bad alignment in "
13724                             "option description");
13725                         return (EINVAL);
13726                 }
13727
13728                 if ((entsize = sec->dofs_entsize) == 0) {
13729                         dtrace_dof_error(dof, "zeroed option entry size");
13730                         return (EINVAL);
13731                 }
13732
13733                 if (entsize < sizeof (dof_optdesc_t)) {
13734                         dtrace_dof_error(dof, "bad option entry size");
13735                         return (EINVAL);
13736                 }
13737
13738                 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13739                         desc = (dof_optdesc_t *)((uintptr_t)dof +
13740                             (uintptr_t)sec->dofs_offset + offs);
13741
13742                         if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13743                                 dtrace_dof_error(dof, "non-zero option string");
13744                                 return (EINVAL);
13745                         }
13746
13747                         if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
13748                                 dtrace_dof_error(dof, "unset option");
13749                                 return (EINVAL);
13750                         }
13751
13752                         if ((rval = dtrace_state_option(state,
13753                             desc->dofo_option, desc->dofo_value)) != 0) {
13754                                 dtrace_dof_error(dof, "rejected option");
13755                                 return (rval);
13756                         }
13757                 }
13758         }
13759
13760         return (0);
13761 }
13762
13763 /*
13764  * DTrace Consumer State Functions
13765  */
13766 static int
13767 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13768 {
13769         size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
13770         void *base;
13771         uintptr_t limit;
13772         dtrace_dynvar_t *dvar, *next, *start;
13773         size_t i;
13774
13775         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13776         ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13777
13778         bzero(dstate, sizeof (dtrace_dstate_t));
13779
13780         if ((dstate->dtds_chunksize = chunksize) == 0)
13781                 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13782
13783         VERIFY(dstate->dtds_chunksize < (LONG_MAX - sizeof (dtrace_dynhash_t)));
13784
13785         if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13786                 size = min_size;
13787
13788         if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
13789                 return (ENOMEM);
13790
13791         dstate->dtds_size = size;
13792         dstate->dtds_base = base;
13793         dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13794         bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t));
13795
13796         hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13797
13798         if (hashsize != 1 && (hashsize & 1))
13799                 hashsize--;
13800
13801         dstate->dtds_hashsize = hashsize;
13802         dstate->dtds_hash = dstate->dtds_base;
13803
13804         /*
13805          * Set all of our hash buckets to point to the single sink, and (if
13806          * it hasn't already been set), set the sink's hash value to be the
13807          * sink sentinel value.  The sink is needed for dynamic variable
13808          * lookups to know that they have iterated over an entire, valid hash
13809          * chain.
13810          */
13811         for (i = 0; i < hashsize; i++)
13812                 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13813
13814         if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13815                 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13816
13817         /*
13818          * Determine number of active CPUs.  Divide free list evenly among
13819          * active CPUs.
13820          */
13821         start = (dtrace_dynvar_t *)
13822             ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13823         limit = (uintptr_t)base + size;
13824
13825         VERIFY((uintptr_t)start < limit);
13826         VERIFY((uintptr_t)start >= (uintptr_t)base);
13827
13828         maxper = (limit - (uintptr_t)start) / (int)NCPU;
13829         maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13830
13831         for (i = 0; i < NCPU; i++) {
13832                 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13833
13834                 /*
13835                  * If we don't even have enough chunks to make it once through
13836                  * NCPUs, we're just going to allocate everything to the first
13837                  * CPU.  And if we're on the last CPU, we're going to allocate
13838                  * whatever is left over.  In either case, we set the limit to
13839                  * be the limit of the dynamic variable space.
13840                  */
13841                 if (maxper == 0 || i == NCPU - 1) {
13842                         limit = (uintptr_t)base + size;
13843                         start = NULL;
13844                 } else {
13845                         limit = (uintptr_t)start + maxper;
13846                         start = (dtrace_dynvar_t *)limit;
13847                 }
13848
13849                 VERIFY(limit <= (uintptr_t)base + size);
13850
13851                 for (;;) {
13852                         next = (dtrace_dynvar_t *)((uintptr_t)dvar +
13853                             dstate->dtds_chunksize);
13854
13855                         if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
13856                                 break;
13857
13858                         VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
13859                             (uintptr_t)dvar <= (uintptr_t)base + size);
13860                         dvar->dtdv_next = next;
13861                         dvar = next;
13862                 }
13863
13864                 if (maxper == 0)
13865                         break;
13866         }
13867
13868         return (0);
13869 }
13870
13871 static void
13872 dtrace_dstate_fini(dtrace_dstate_t *dstate)
13873 {
13874         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13875
13876         if (dstate->dtds_base == NULL)
13877                 return;
13878
13879         kmem_free(dstate->dtds_base, dstate->dtds_size);
13880         kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
13881 }
13882
13883 static void
13884 dtrace_vstate_fini(dtrace_vstate_t *vstate)
13885 {
13886         /*
13887          * Logical XOR, where are you?
13888          */
13889         ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
13890
13891         if (vstate->dtvs_nglobals > 0) {
13892                 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
13893                     sizeof (dtrace_statvar_t *));
13894         }
13895
13896         if (vstate->dtvs_ntlocals > 0) {
13897                 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
13898                     sizeof (dtrace_difv_t));
13899         }
13900
13901         ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
13902
13903         if (vstate->dtvs_nlocals > 0) {
13904                 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
13905                     sizeof (dtrace_statvar_t *));
13906         }
13907 }
13908
13909 static void
13910 dtrace_state_clean(dtrace_state_t *state)
13911 {
13912         if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13913                 return;
13914
13915         dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13916         dtrace_speculation_clean(state);
13917 }
13918
13919 static void
13920 dtrace_state_deadman(dtrace_state_t *state)
13921 {
13922         hrtime_t now;
13923
13924         dtrace_sync();
13925
13926         now = dtrace_gethrtime();
13927
13928         if (state != dtrace_anon.dta_state &&
13929             now - state->dts_laststatus >= dtrace_deadman_user)
13930                 return;
13931
13932         /*
13933          * We must be sure that dts_alive never appears to be less than the
13934          * value upon entry to dtrace_state_deadman(), and because we lack a
13935          * dtrace_cas64(), we cannot store to it atomically.  We thus instead
13936          * store INT64_MAX to it, followed by a memory barrier, followed by
13937          * the new value.  This assures that dts_alive never appears to be
13938          * less than its true value, regardless of the order in which the
13939          * stores to the underlying storage are issued.
13940          */
13941         state->dts_alive = INT64_MAX;
13942         dtrace_membar_producer();
13943         state->dts_alive = now;
13944 }
13945
13946 static int
13947 dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
13948 {
13949         minor_t minor;
13950         major_t major;
13951         char c[30];
13952         dtrace_state_t *state;
13953         dtrace_optval_t *opt;
13954         int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
13955         unsigned int cpu_it;
13956
13957         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13958         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13959
13960         /* Cause restart */
13961         *new_state = NULL;
13962
13963         if (devp != NULL) {
13964                 minor = getminor(*devp);
13965         }
13966         else {
13967                 minor = DTRACE_NCLIENTS - 1;
13968         }
13969
13970         state = dtrace_state_allocate(minor);
13971         if (NULL == state) {
13972                 printf("dtrace_open: couldn't acquire minor number %d. This usually means that too many DTrace clients are in use at the moment", minor);
13973                 return (ERESTART);      /* can't reacquire */
13974         }
13975
13976         state->dts_epid = DTRACE_EPIDNONE + 1;
13977
13978         (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
13979         state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
13980             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
13981
13982         if (devp != NULL) {
13983                 major = getemajor(*devp);
13984         } else {
13985                 major = ddi_driver_major(dtrace_devi);
13986         }
13987
13988         state->dts_dev = makedev(major, minor);
13989
13990         if (devp != NULL)
13991                 *devp = state->dts_dev;
13992
13993         /*
13994          * We allocate NCPU buffers.  On the one hand, this can be quite
13995          * a bit of memory per instance (nearly 36K on a Starcat).  On the
13996          * other hand, it saves an additional memory reference in the probe
13997          * path.
13998          */
13999         state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14000         state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14001         state->dts_buf_over_limit = 0;
14002
14003         /*
14004          * Allocate and initialise the per-process per-CPU random state.
14005          * SI_SUB_RANDOM < SI_SUB_DTRACE_ANON therefore entropy device is
14006          * assumed to be seeded at this point (if from Fortuna seed file).
14007          */
14008         state->dts_rstate = kmem_zalloc(NCPU * sizeof(uint64_t*), KM_SLEEP);
14009         state->dts_rstate[0] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
14010         (void) read_random(state->dts_rstate[0], 2 * sizeof(uint64_t));
14011         for (cpu_it = 1; cpu_it < NCPU; cpu_it++) {
14012                 state->dts_rstate[cpu_it] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
14013                 /*
14014                  * Each CPU is assigned a 2^64 period, non-overlapping
14015                  * subsequence.
14016                  */
14017                 dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1],
14018                     state->dts_rstate[cpu_it]);
14019         }
14020
14021         state->dts_cleaner = CYCLIC_NONE;
14022         state->dts_deadman = CYCLIC_NONE;
14023         state->dts_vstate.dtvs_state = state;
14024
14025         for (i = 0; i < DTRACEOPT_MAX; i++)
14026                 state->dts_options[i] = DTRACEOPT_UNSET;
14027
14028         /*
14029          * Set the default options.
14030          */
14031         opt = state->dts_options;
14032         opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14033         opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14034         opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14035         opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14036         opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14037         opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14038         opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14039         opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14040         opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14041         opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14042         opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14043         opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14044         opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14045         opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14046         opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_default;
14047
14048         /*
14049          * Depending on the user credentials, we set flag bits which alter probe
14050          * visibility or the amount of destructiveness allowed.  In the case of
14051          * actual anonymous tracing, or the possession of all privileges, all of
14052          * the normal checks are bypassed.
14053          */
14054 #if defined(__APPLE__)
14055         if (cr != NULL) {
14056                 kauth_cred_ref(cr);
14057                 state->dts_cred.dcr_cred = cr;
14058         }
14059         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14060                 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14061                         /*
14062                          * Allow only proc credentials when DTrace is
14063                          * restricted by the current security policy
14064                          */
14065                         state->dts_cred.dcr_visible = DTRACE_CRV_ALLPROC;
14066                         state->dts_cred.dcr_action = DTRACE_CRA_PROC | DTRACE_CRA_PROC_CONTROL | DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14067                 }
14068                 else {
14069                         state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14070                         state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14071                 }
14072         }
14073
14074 #else
14075         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14076                 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14077                 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14078         }
14079         else {
14080                 /*
14081                  * Set up the credentials for this instantiation.  We take a
14082                  * hold on the credential to prevent it from disappearing on
14083                  * us; this in turn prevents the zone_t referenced by this
14084                  * credential from disappearing.  This means that we can
14085                  * examine the credential and the zone from probe context.
14086                  */
14087                 crhold(cr);
14088                 state->dts_cred.dcr_cred = cr;
14089
14090                 /*
14091                  * CRA_PROC means "we have *some* privilege for dtrace" and
14092                  * unlocks the use of variables like pid, zonename, etc.
14093                  */
14094                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14095                     PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14096                         state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14097                 }
14098
14099                 /*
14100                  * dtrace_user allows use of syscall and profile providers.
14101                  * If the user also has proc_owner and/or proc_zone, we
14102                  * extend the scope to include additional visibility and
14103                  * destructive power.
14104                  */
14105                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14106                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14107                                 state->dts_cred.dcr_visible |=
14108                                     DTRACE_CRV_ALLPROC;
14109
14110                                 state->dts_cred.dcr_action |=
14111                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14112                         }
14113
14114                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14115                                 state->dts_cred.dcr_visible |=
14116                                     DTRACE_CRV_ALLZONE;
14117
14118                                 state->dts_cred.dcr_action |=
14119                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14120                         }
14121
14122                         /*
14123                          * If we have all privs in whatever zone this is,
14124                          * we can do destructive things to processes which
14125                          * have altered credentials.
14126                          *
14127                          * APPLE NOTE: Darwin doesn't do zones.
14128                          * Behave as if zone always has destructive privs.
14129                          */
14130
14131                         state->dts_cred.dcr_action |=
14132                                 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14133                 }
14134
14135                 /*
14136                  * Holding the dtrace_kernel privilege also implies that
14137                  * the user has the dtrace_user privilege from a visibility
14138                  * perspective.  But without further privileges, some
14139                  * destructive actions are not available.
14140                  */
14141                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14142                         /*
14143                          * Make all probes in all zones visible.  However,
14144                          * this doesn't mean that all actions become available
14145                          * to all zones.
14146                          */
14147                         state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14148                             DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14149
14150                         state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14151                             DTRACE_CRA_PROC;
14152                         /*
14153                          * Holding proc_owner means that destructive actions
14154                          * for *this* zone are allowed.
14155                          */
14156                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14157                                 state->dts_cred.dcr_action |=
14158                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14159
14160                         /*
14161                          * Holding proc_zone means that destructive actions
14162                          * for this user/group ID in all zones is allowed.
14163                          */
14164                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14165                                 state->dts_cred.dcr_action |=
14166                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14167
14168                         /*
14169                          * If we have all privs in whatever zone this is,
14170                          * we can do destructive things to processes which
14171                          * have altered credentials.
14172                          *
14173                          * APPLE NOTE: Darwin doesn't do zones.
14174                          * Behave as if zone always has destructive privs.
14175                          */
14176                         state->dts_cred.dcr_action |=
14177                                 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14178                 }
14179
14180                 /*
14181                  * Holding the dtrace_proc privilege gives control over fasttrap
14182                  * and pid providers.  We need to grant wider destructive
14183                  * privileges in the event that the user has proc_owner and/or
14184                  * proc_zone.
14185                  */
14186                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14187                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14188                                 state->dts_cred.dcr_action |=
14189                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14190
14191                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14192                                 state->dts_cred.dcr_action |=
14193                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14194                 }
14195         }
14196 #endif
14197
14198         *new_state = state;
14199         return(0);  /* Success */
14200 }
14201
14202 static int
14203 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14204 {
14205         dtrace_optval_t *opt = state->dts_options, size;
14206         processorid_t cpu = 0;
14207         size_t limit = buf->dtb_size;
14208         int flags = 0, rval;
14209
14210         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14211         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14212         ASSERT(which < DTRACEOPT_MAX);
14213         ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14214             (state == dtrace_anon.dta_state &&
14215             state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14216
14217         if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14218                 return (0);
14219
14220         if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14221                 cpu = opt[DTRACEOPT_CPU];
14222
14223         if (which == DTRACEOPT_SPECSIZE)
14224                 flags |= DTRACEBUF_NOSWITCH;
14225
14226         if (which == DTRACEOPT_BUFSIZE) {
14227                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14228                         flags |= DTRACEBUF_RING;
14229
14230                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14231                         flags |= DTRACEBUF_FILL;
14232
14233                 if (state != dtrace_anon.dta_state ||
14234                     state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14235                         flags |= DTRACEBUF_INACTIVE;
14236         }
14237
14238         for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
14239                 /*
14240                  * The size must be 8-byte aligned.  If the size is not 8-byte
14241                  * aligned, drop it down by the difference.
14242                  */
14243                 if (size & (sizeof (uint64_t) - 1))
14244                         size -= size & (sizeof (uint64_t) - 1);
14245
14246                 if (size < state->dts_reserve) {
14247                         /*
14248                          * Buffers always must be large enough to accommodate
14249                          * their prereserved space.  We return E2BIG instead
14250                          * of ENOMEM in this case to allow for user-level
14251                          * software to differentiate the cases.
14252                          */
14253                         return (E2BIG);
14254                 }
14255                 limit = opt[DTRACEOPT_BUFLIMIT] * size / 100;
14256                 rval = dtrace_buffer_alloc(buf, limit, size, flags, cpu);
14257
14258                 if (rval != ENOMEM) {
14259                         opt[which] = size;
14260                         return (rval);
14261                 }
14262
14263                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14264                         return (rval);
14265         }
14266
14267         return (ENOMEM);
14268 }
14269
14270 static int
14271 dtrace_state_buffers(dtrace_state_t *state)
14272 {
14273         dtrace_speculation_t *spec = state->dts_speculations;
14274         int rval, i;
14275
14276         if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14277             DTRACEOPT_BUFSIZE)) != 0)
14278                 return (rval);
14279
14280         if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14281             DTRACEOPT_AGGSIZE)) != 0)
14282                 return (rval);
14283
14284         for (i = 0; i < state->dts_nspeculations; i++) {
14285                 if ((rval = dtrace_state_buffer(state,
14286                     spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14287                         return (rval);
14288         }
14289
14290         return (0);
14291 }
14292
14293 static void
14294 dtrace_state_prereserve(dtrace_state_t *state)
14295 {
14296         dtrace_ecb_t *ecb;
14297         dtrace_probe_t *probe;
14298
14299         state->dts_reserve = 0;
14300
14301         if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14302                 return;
14303
14304         /*
14305          * If our buffer policy is a "fill" buffer policy, we need to set the
14306          * prereserved space to be the space required by the END probes.
14307          */
14308         probe = dtrace_probes[dtrace_probeid_end - 1];
14309         ASSERT(probe != NULL);
14310
14311         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14312                 if (ecb->dte_state != state)
14313                         continue;
14314
14315                 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14316         }
14317 }
14318
/*
 * Start tracing for the given consumer state.
 *
 * In order, this primes the retained enablings for the state, computes the
 * END-probe reserve, allocates the speculation array and its per-speculation
 * buffer arrays, optionally grabs the anonymous state, sanity-checks and
 * clamps the relevant options, allocates the principal/aggregation/
 * speculation buffers and the dynamic variable state, installs the cleaner
 * and deadman cyclics, and finally fires the BEGIN probe and activates the
 * buffers on every CPU.
 *
 * On success, *cpu is set to the id of the CPU on which the BEGIN probe was
 * fired (or to dtrace_anon.dta_beganon when an already-active anonymous
 * state was grabbed).
 *
 * Returns 0 on success, or one of EBUSY (state is not INACTIVE), EACCES
 * (destructive actions present but not permitted), ENOMEM (speculation
 * allocation failed), ENOENT / EALREADY (grabanon failures), ENOSPC (a needed
 * buffer was sized below sizeof (uint64_t)), or the error returned by
 * dtrace_state_buffers() / dtrace_dstate_init().
 *
 * Acquires cpu_lock and then dtrace_lock; both are released before return.
 */
static int
dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
{
        /*
         * Note that opt is captured from the state passed in; if the
         * anonymous state is grabbed below, "state" is redirected but opt
         * continues to refer to the grabbing state's options.
         */
        dtrace_optval_t *opt = state->dts_options, sz, nspec;
        dtrace_speculation_t *spec;
        dtrace_buffer_t *buf;
        cyc_handler_t hdlr;
        cyc_time_t when;
        /* bufsize is the size of one per-CPU array of dtrace_buffer_t. */
        int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
        dtrace_icookie_t cookie;

        lck_mtx_lock(&cpu_lock);
        lck_mtx_lock(&dtrace_lock);

        if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
                rval = EBUSY;
                goto out;
        }

        /*
         * Before we can perform any checks, we must prime all of the
         * retained enablings that correspond to this state.
         */
        dtrace_enabling_prime(state);

        if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
                rval = EACCES;
                goto out;
        }

        dtrace_state_prereserve(state);

        /*
         * What we want to do now is try to allocate our speculations.
         * We do not automatically resize the number of speculations; if
         * this fails, we will fail the operation.
         */
        nspec = opt[DTRACEOPT_NSPEC];
        ASSERT(nspec != DTRACEOPT_UNSET);

        /* dts_nspeculations is an int; refuse counts that cannot fit. */
        if (nspec > INT_MAX) {
                rval = ENOMEM;
                goto out;
        }

        spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);

        if (spec == NULL) {
                rval = ENOMEM;
                goto out;
        }

        state->dts_speculations = spec;
        state->dts_nspeculations = (int)nspec;

        /*
         * Give each speculation its own zeroed per-CPU buffer array.  On
         * failure, the err path relies on the zeroed dtsp_buffer pointers to
         * find where allocation stopped.
         */
        for (i = 0; i < nspec; i++) {
                if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
                        rval = ENOMEM;
                        goto err;
                }

                spec[i].dtsp_buffer = buf;
        }

        if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
                /*
                 * NOTE(review): this and the later "goto out" exits bypass
                 * the err label, so the speculation array allocated above
                 * stays attached to the state.  Presumably it is released
                 * when the state is destroyed -- confirm that a repeated
                 * call on a still-INACTIVE state cannot overwrite (and thus
                 * leak) dts_speculations.
                 */
                if (dtrace_anon.dta_state == NULL) {
                        rval = ENOENT;
                        goto out;
                }

                if (state->dts_necbs != 0) {
                        rval = EALREADY;
                        goto out;
                }

                state->dts_anon = dtrace_anon_grab();
                ASSERT(state->dts_anon != NULL);
                /* From here on, "state" refers to the grabbed state. */
                state = state->dts_anon;

                /*
                 * We want "grabanon" to be set in the grabbed state, so we'll
                 * copy that option value from the grabbing state into the
                 * grabbed state.
                 */
                state->dts_options[DTRACEOPT_GRABANON] =
                    opt[DTRACEOPT_GRABANON];

                *cpu = dtrace_anon.dta_beganon;

                /*
                 * If the anonymous state is active (as it almost certainly
                 * is if the anonymous enabling ultimately matched anything),
                 * we don't allow any further option processing -- but we
                 * don't return failure.
                 */
                if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
                        goto out;
        }

        if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
            opt[DTRACEOPT_AGGSIZE] != 0) {
                if (state->dts_aggregations == NULL) {
                        /*
                         * We're not going to create an aggregation buffer
                         * because we don't have any ECBs that contain
                         * aggregations -- set this option to 0.
                         */
                        opt[DTRACEOPT_AGGSIZE] = 0;
                } else {
                        /*
                         * If we have an aggregation buffer, we must also have
                         * a buffer to use as scratch.
                         */
                        if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
                          (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
                                opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
                        }
                }
        }

        if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
            opt[DTRACEOPT_SPECSIZE] != 0) {
                if (!state->dts_speculates) {
                        /*
                         * We're not going to create speculation buffers
                         * because we don't have any ECBs that actually
                         * speculate -- set the speculation size to 0.
                         */
                        opt[DTRACEOPT_SPECSIZE] = 0;
                }
        }

        /*
         * The bare minimum size for any buffer that we're actually going to
         * do anything to is sizeof (uint64_t).
         */
        sz = sizeof (uint64_t);

        if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
            (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
            (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
                /*
                 * A buffer size has been explicitly set to 0 (or to a size
                 * that will be adjusted to 0) and we need the space -- we
                 * need to return failure.  We return ENOSPC to differentiate
                 * it from failing to allocate a buffer due to failure to meet
                 * the reserve (for which we return E2BIG).
                 */
                rval = ENOSPC;
                goto out;
        }

        if ((rval = dtrace_state_buffers(state)) != 0)
                goto err;

        if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
                sz = dtrace_dstate_defsize;

        /*
         * Try to initialize the dynamic variable state, halving the size
         * after each failure.  With a "manual" resize policy we give up on
         * the first failure; otherwise we stop once sz has been shifted
         * down to zero.
         */
        do {
                rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);

                if (rval == 0)
                        break;

                if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
                        goto err;
        } while (sz >>= 1);

        /* Record the size that was actually used (0 if every try failed). */
        opt[DTRACEOPT_DYNVARSIZE] = sz;

        if (rval != 0)
                goto err;

        /*
         * Clamp the rate, string size and buffer limit options to their
         * permitted ranges.  A cleanrate of 0 is treated as the maximum.
         */
        if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
                opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;

        if (opt[DTRACEOPT_CLEANRATE] == 0)
                opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;

        if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
                opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;

        if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
                opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;

        if (opt[DTRACEOPT_STRSIZE] > dtrace_strsize_max)
                opt[DTRACEOPT_STRSIZE] = dtrace_strsize_max;

        if (opt[DTRACEOPT_STRSIZE] < dtrace_strsize_min)
                opt[DTRACEOPT_STRSIZE] = dtrace_strsize_min;

        if (opt[DTRACEOPT_BUFLIMIT] > dtrace_buflimit_max)
                opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_max;

        if (opt[DTRACEOPT_BUFLIMIT] < dtrace_buflimit_min)
                opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_min;

        /* Install the cleaner cyclic, firing every cleanrate interval. */
        hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
        hdlr.cyh_arg = state;
        hdlr.cyh_level = CY_LOW_LEVEL;

        when.cyt_when = 0;
        when.cyt_interval = opt[DTRACEOPT_CLEANRATE];

        state->dts_cleaner = cyclic_add(&hdlr, &when);

        /* Install the deadman cyclic, firing every dtrace_deadman_interval. */
        hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
        hdlr.cyh_arg = state;
        hdlr.cyh_level = CY_LOW_LEVEL;

        when.cyt_when = 0;
        when.cyt_interval = dtrace_deadman_interval;

        state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
        state->dts_deadman = cyclic_add(&hdlr, &when);

        state->dts_activity = DTRACE_ACTIVITY_WARMUP;

        /*
         * Now it's time to actually fire the BEGIN probe.  We need to disable
         * interrupts here both to record the CPU on which we fired the BEGIN
         * probe (the data from this CPU will be processed first at user
         * level) and to manually activate the buffer for this CPU.
         */
        cookie = dtrace_interrupt_disable();
        *cpu = CPU->cpu_id;
        ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
        state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;

        dtrace_probe(dtrace_probeid_begin,
            (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
        dtrace_interrupt_enable(cookie);
        /*
         * We may have had an exit action from a BEGIN probe; only change our
         * state to ACTIVE if we're still in WARMUP.
         */
        ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
            state->dts_activity == DTRACE_ACTIVITY_DRAINING);

        if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
                state->dts_activity = DTRACE_ACTIVITY_ACTIVE;

        /*
         * Regardless of whether or not now we're in ACTIVE or DRAINING, we
         * want each CPU to transition its principal buffer out of the
         * INACTIVE state.  Doing this assures that no CPU will suddenly begin
         * processing an ECB halfway down a probe's ECB chain; all CPUs will
         * atomically transition from processing none of a state's ECBs to
         * processing all of them.
         */
        dtrace_xcall(DTRACE_CPUALL,
            (dtrace_xcall_t)dtrace_buffer_activate, state);
        goto out;

err:
        /*
         * Failure after allocation began:  release the principal and
         * aggregation buffers, then every speculation buffer array that was
         * allocated, then the speculation array itself.
         */
        dtrace_buffer_free(state->dts_buffer);
        dtrace_buffer_free(state->dts_aggbuffer);

        if ((nspec = state->dts_nspeculations) == 0) {
                ASSERT(state->dts_speculations == NULL);
                goto out;
        }

        spec = state->dts_speculations;
        ASSERT(spec != NULL);

        for (i = 0; i < state->dts_nspeculations; i++) {
                /* A NULL entry marks where the allocation loop stopped. */
                if ((buf = spec[i].dtsp_buffer) == NULL)
                        break;

                dtrace_buffer_free(buf);
                kmem_free(buf, bufsize);
        }

        kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
        state->dts_nspeculations = 0;
        state->dts_speculations = NULL;

out:
        lck_mtx_unlock(&dtrace_lock);
        lck_mtx_unlock(&cpu_lock);

        return (rval);
}
14603
/*
 * Stop tracing for the given consumer state and fire the END probe.
 *
 * The state must currently be ACTIVE or DRAINING; otherwise EINVAL is
 * returned and nothing is changed.  The activity is stepped through
 * DRAINING and COOLDOWN (with a dtrace_sync() after each step), the END
 * reserve is released, the END probe is fired with interrupts disabled, and
 * the state is finally marked STOPPED.
 *
 * On success, *cpu is set to the id of the CPU on which the END probe was
 * fired and 0 is returned.  The caller must hold dtrace_lock.
 */
static int
dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
{
        dtrace_icookie_t cookie;

        LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);

        if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
            state->dts_activity != DTRACE_ACTIVITY_DRAINING)
                return (EINVAL);

        /*
         * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
         * to be sure that every CPU has seen it.  See below for the details
         * on why this is done.
         */
        state->dts_activity = DTRACE_ACTIVITY_DRAINING;
        dtrace_sync();

        /*
         * By this point, it is impossible for any CPU to be still processing
         * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
         * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
         * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
         * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
         * iff we're in the END probe.
         */
        state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
        dtrace_sync();
        ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);

        /*
         * Finally, we can release the reserve and call the END probe.  We
         * disable interrupts across calling the END probe to allow us to
         * return the CPU on which we actually called the END probe.  This
         * allows user-land to be sure that this CPU's principal buffer is
         * processed last.
         */
        state->dts_reserve = 0;

        cookie = dtrace_interrupt_disable();
        *cpu = CPU->cpu_id;
        dtrace_probe(dtrace_probeid_end,
            (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
        dtrace_interrupt_enable(cookie);

        /* The END probe has run; mark the state stopped and publish it. */
        state->dts_activity = DTRACE_ACTIVITY_STOPPED;
        dtrace_sync();

        return (0);
}
14655
14656 static int
14657 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14658     dtrace_optval_t val)
14659 {
14660         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14661
14662         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14663                 return (EBUSY);
14664
14665         if (option >= DTRACEOPT_MAX)
14666                 return (EINVAL);
14667
14668         if (option != DTRACEOPT_CPU && val < 0)
14669                 return (EINVAL);
14670
14671         switch (option) {
14672         case DTRACEOPT_DESTRUCTIVE:
14673                 /*
14674                  * Prevent consumers from enabling destructive actions if DTrace
14675                  * is running in a restricted environment, or if actions are
14676                  * disallowed.
14677                  */
14678                 if (dtrace_is_restricted() || dtrace_destructive_disallow)
14679                         return (EACCES);
14680
14681                 state->dts_cred.dcr_destructive = 1;
14682                 break;
14683
14684         case DTRACEOPT_BUFSIZE:
14685         case DTRACEOPT_DYNVARSIZE:
14686         case DTRACEOPT_AGGSIZE:
14687         case DTRACEOPT_SPECSIZE:
14688         case DTRACEOPT_STRSIZE:
14689                 if (val < 0)
14690                         return (EINVAL);
14691
14692                 if (val >= LONG_MAX) {
14693                         /*
14694                          * If this is an otherwise negative value, set it to
14695                          * the highest multiple of 128m less than LONG_MAX.
14696                          * Technically, we're adjusting the size without
14697                          * regard to the buffer resizing policy, but in fact,
14698                          * this has no effect -- if we set the buffer size to
14699                          * ~LONG_MAX and the buffer policy is ultimately set to
14700                          * be "manual", the buffer allocation is guaranteed to
14701                          * fail, if only because the allocation requires two
14702                          * buffers.  (We set the the size to the highest
14703                          * multiple of 128m because it ensures that the size
14704                          * will remain a multiple of a megabyte when
14705                          * repeatedly halved -- all the way down to 15m.)
14706                          */
14707                         val = LONG_MAX - (1 << 27) + 1;
14708                 }
14709         }
14710
14711         state->dts_options[option] = val;
14712
14713         return (0);
14714 }
14715
/*
 * Tear down a consumer state and release everything attached to it.
 *
 * In order:  retained enablings are retracted; a still-hot state is marked
 * KILLED and synced; the credential reference is dropped; all ECBs are
 * disabled and destroyed; the principal, aggregation and speculation buffers
 * are freed; the cleaner and deadman cyclics are removed; the dynamic
 * variable and variable state are finalized; the remaining arrays are freed;
 * and finally the state itself is released via dtrace_state_free().
 *
 * The caller must hold both dtrace_lock and cpu_lock.
 */
static void
dtrace_state_destroy(dtrace_state_t *state)
{
        dtrace_ecb_t *ecb;
        dtrace_vstate_t *vstate = &state->dts_vstate;
        minor_t minor = getminor(state->dts_dev);
        /* bufsize is the size of one per-CPU array of dtrace_buffer_t. */
        int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
        dtrace_speculation_t *spec = state->dts_speculations;
        int nspec = state->dts_nspeculations;
        uint32_t match;

        LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
        LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);

        /*
         * First, retract any retained enablings for this state.
         */
        dtrace_enabling_retract(state);
        ASSERT(state->dts_nretained == 0);

        if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
            state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
                /*
                 * We have managed to come into dtrace_state_destroy() on a
                 * hot enabling -- almost certainly because of a disorderly
                 * shutdown of a consumer.  (That is, a consumer that is
                 * exiting without having called dtrace_stop().) In this case,
                 * we're going to set our activity to be KILLED, and then
                 * issue a sync to be sure that everyone is out of probe
                 * context before we start blowing away ECBs.
                 */
                state->dts_activity = DTRACE_ACTIVITY_KILLED;
                dtrace_sync();
        }

        /*
         * Release the credential hold we took in dtrace_state_create().
         */
        if (state->dts_cred.dcr_cred != NULL)
                kauth_cred_unref(&state->dts_cred.dcr_cred);

        /*
         * Now we can safely disable and destroy any enabled probes.  Because
         * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
         * (especially if they're all enabled), we take two passes through the
         * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
         * in the second we disable whatever is left over.
         */
        for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
                for (i = 0; i < state->dts_necbs; i++) {
                        if ((ecb = state->dts_ecbs[i]) == NULL)
                                continue;

                        /*
                         * On the first pass (match != 0), skip ECBs whose
                         * provider does not carry the matching privilege
                         * flag; the second pass picks them up.
                         */
                        if (match && ecb->dte_probe != NULL) {
                                dtrace_probe_t *probe = ecb->dte_probe;
                                dtrace_provider_t *prov = probe->dtpr_provider;

                                if (!(prov->dtpv_priv.dtpp_flags & match))
                                        continue;
                        }

                        dtrace_ecb_disable(ecb);
                        dtrace_ecb_destroy(ecb);
                }

                if (!match)
                        break;
        }

        /*
         * Before we free the buffers, perform one more sync to assure that
         * every CPU is out of probe context.
         */
        dtrace_sync();

        dtrace_buffer_free(state->dts_buffer);
        dtrace_buffer_free(state->dts_aggbuffer);

        /*
         * Free each per-CPU dts_rstate entry (two uint64_t values apiece),
         * then the array of pointers itself.
         */
        for (i = 0; i < (int)NCPU; i++) {
                kmem_free(state->dts_rstate[i], 2 * sizeof(uint64_t));
        }
        kmem_free(state->dts_rstate, NCPU * sizeof(uint64_t*));

        for (i = 0; i < nspec; i++)
                dtrace_buffer_free(spec[i].dtsp_buffer);

        if (state->dts_cleaner != CYCLIC_NONE)
                cyclic_remove(state->dts_cleaner);

        if (state->dts_deadman != CYCLIC_NONE)
                cyclic_remove(state->dts_deadman);

        dtrace_dstate_fini(&vstate->dtvs_dynvars);
        dtrace_vstate_fini(vstate);
        kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));

        if (state->dts_aggregations != NULL) {
#if DEBUG
                /* Every slot in the aggregation table should be empty by now. */
                for (i = 0; i < state->dts_naggregations; i++)
                        ASSERT(state->dts_aggregations[i] == NULL);
#endif
                ASSERT(state->dts_naggregations > 0);
                kmem_free(state->dts_aggregations,
                    state->dts_naggregations * sizeof (dtrace_aggregation_t *));
        }

        /*
         * The buffer contents were released by dtrace_buffer_free() above;
         * now free the per-CPU dtrace_buffer_t arrays themselves.
         */
        kmem_free(state->dts_buffer, bufsize);
        kmem_free(state->dts_aggbuffer, bufsize);

        for (i = 0; i < nspec; i++)
                kmem_free(spec[i].dtsp_buffer, bufsize);

        kmem_free(spec, nspec * sizeof (dtrace_speculation_t));

        dtrace_format_destroy(state);

        vmem_destroy(state->dts_aggid_arena);
        dtrace_state_free(minor);
}
14835
14836 /*
14837  * DTrace Anonymous Enabling Functions
14838  */
14839
14840 int
14841 dtrace_keep_kernel_symbols(void)
14842 {
14843         if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14844                 return 0;
14845         }
14846
14847         if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL)
14848                 return 1;
14849
14850         return 0;
14851 }
14852
14853 static dtrace_state_t *
14854 dtrace_anon_grab(void)
14855 {
14856         dtrace_state_t *state;
14857
14858         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14859
14860         if ((state = dtrace_anon.dta_state) == NULL) {
14861                 ASSERT(dtrace_anon.dta_enabling == NULL);
14862                 return (NULL);
14863         }
14864
14865         ASSERT(dtrace_anon.dta_enabling != NULL);
14866         ASSERT(dtrace_retained != NULL);
14867
14868         dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14869         dtrace_anon.dta_enabling = NULL;
14870         dtrace_anon.dta_state = NULL;
14871
14872         return (state);
14873 }
14874
14875 static void
14876 dtrace_anon_property(void)
14877 {
14878         int i, rv;
14879         dtrace_state_t *state;
14880         dof_hdr_t *dof;
14881         char c[32];             /* enough for "dof-data-" + digits */
14882
14883         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14884         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14885
14886         for (i = 0; ; i++) {
14887                 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
14888
14889                 dtrace_err_verbose = 1;
14890
14891                 if ((dof = dtrace_dof_property(c)) == NULL) {
14892                         dtrace_err_verbose = 0;
14893                         break;
14894                 }
14895
14896 #ifdef illumos
14897                 /*
14898                  * We want to create anonymous state, so we need to transition
14899                  * the kernel debugger to indicate that DTrace is active.  If
14900                  * this fails (e.g. because the debugger has modified text in
14901                  * some way), we won't continue with the processing.
14902                  */
14903                 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14904                         cmn_err(CE_NOTE, "kernel debugger active; anonymous "
14905                             "enabling ignored.");
14906                         dtrace_dof_destroy(dof);
14907                         break;
14908                 }
14909 #endif
14910
14911                 /*
14912                  * If we haven't allocated an anonymous state, we'll do so now.
14913                  */
14914                 if ((state = dtrace_anon.dta_state) == NULL) {
14915                         rv = dtrace_state_create(NULL, NULL, &state);
14916                         dtrace_anon.dta_state = state;
14917                         if (rv != 0 || state == NULL) {
14918                                 /*
14919                                  * This basically shouldn't happen:  the only
14920                                  * failure mode from dtrace_state_create() is a
14921                                  * failure of ddi_soft_state_zalloc() that
14922                                  * itself should never happen.  Still, the
14923                                  * interface allows for a failure mode, and
14924                                  * we want to fail as gracefully as possible:
14925                                  * we'll emit an error message and cease
14926                                  * processing anonymous state in this case.
14927                                  */
14928                                 cmn_err(CE_WARN, "failed to create "
14929                                     "anonymous state");
14930                                 dtrace_dof_destroy(dof);
14931                                 break;
14932                         }
14933                 }
14934
14935                 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
14936                     &dtrace_anon.dta_enabling, 0, B_TRUE);
14937
14938                 if (rv == 0)
14939                         rv = dtrace_dof_options(dof, state);
14940
14941                 dtrace_err_verbose = 0;
14942                 dtrace_dof_destroy(dof);
14943
14944                 if (rv != 0) {
14945                         /*
14946                          * This is malformed DOF; chuck any anonymous state
14947                          * that we created.
14948                          */
14949                         ASSERT(dtrace_anon.dta_enabling == NULL);
14950                         dtrace_state_destroy(state);
14951                         dtrace_anon.dta_state = NULL;
14952                         break;
14953                 }
14954
14955                 ASSERT(dtrace_anon.dta_enabling != NULL);
14956         }
14957
14958         if (dtrace_anon.dta_enabling != NULL) {
14959                 int rval;
14960
14961                 /*
14962                  * dtrace_enabling_retain() can only fail because we are
14963                  * trying to retain more enablings than are allowed -- but
14964                  * we only have one anonymous enabling, and we are guaranteed
14965                  * to be allowed at least one retained enabling; we assert
14966                  * that dtrace_enabling_retain() returns success.
14967                  */
14968                 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
14969                 ASSERT(rval == 0);
14970
14971                 dtrace_enabling_dump(dtrace_anon.dta_enabling);
14972         }
14973 }
14974
14975 /*
14976  * DTrace Helper Functions
14977  */
14978 static void
14979 dtrace_helper_trace(dtrace_helper_action_t *helper,
14980     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
14981 {
14982         uint32_t size, next, nnext;
14983         int i;
14984         dtrace_helptrace_t *ent;
14985         uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
14986
14987         if (!dtrace_helptrace_enabled)
14988                 return;
14989
14990         ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
14991
14992         /*
14993          * What would a tracing framework be without its own tracing
14994          * framework?  (Well, a hell of a lot simpler, for starters...)
14995          */
14996         size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
14997             sizeof (uint64_t) - sizeof (uint64_t);
14998
14999         /*
15000          * Iterate until we can allocate a slot in the trace buffer.
15001          */
15002         do {
15003                 next = dtrace_helptrace_next;
15004
15005                 if (next + size < dtrace_helptrace_bufsize) {
15006                         nnext = next + size;
15007                 } else {
15008                         nnext = size;
15009                 }
15010         } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
15011
15012         /*
15013          * We have our slot; fill it in.
15014          */
15015         if (nnext == size)
15016                 next = 0;
15017
15018         ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
15019         ent->dtht_helper = helper;
15020         ent->dtht_where = where;
15021         ent->dtht_nlocals = vstate->dtvs_nlocals;
15022
15023         ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15024             mstate->dtms_fltoffs : -1;
15025         ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15026         ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
15027
15028         for (i = 0; i < vstate->dtvs_nlocals; i++) {
15029                 dtrace_statvar_t *svar;
15030
15031                 if ((svar = vstate->dtvs_locals[i]) == NULL)
15032                         continue;
15033
15034                 ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
15035                 ent->dtht_locals[i] =
15036                     ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
15037         }
15038 }
15039
15040 static uint64_t
15041 dtrace_helper(int which, dtrace_mstate_t *mstate,
15042     dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15043 {
15044         uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15045         uint64_t sarg0 = mstate->dtms_arg[0];
15046         uint64_t sarg1 = mstate->dtms_arg[1];
15047         uint64_t rval = 0;
15048         dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15049         dtrace_helper_action_t *helper;
15050         dtrace_vstate_t *vstate;
15051         dtrace_difo_t *pred;
15052         int i, trace = dtrace_helptrace_enabled;
15053
15054         ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15055
15056         if (helpers == NULL)
15057                 return (0);
15058
15059         if ((helper = helpers->dthps_actions[which]) == NULL)
15060                 return (0);
15061
15062         vstate = &helpers->dthps_vstate;
15063         mstate->dtms_arg[0] = arg0;
15064         mstate->dtms_arg[1] = arg1;
15065
15066         /*
15067          * Now iterate over each helper.  If its predicate evaluates to 'true',
15068          * we'll call the corresponding actions.  Note that the below calls
15069          * to dtrace_dif_emulate() may set faults in machine state.  This is
15070          * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
15071          * the stored DIF offset with its own (which is the desired behavior).
15072          * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15073          * from machine state; this is okay, too.
15074          */
15075         for (; helper != NULL; helper = helper->dtha_next) {
15076                 if ((pred = helper->dtha_predicate) != NULL) {
15077                         if (trace)
15078                                 dtrace_helper_trace(helper, mstate, vstate, 0);
15079
15080                         if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15081                                 goto next;
15082
15083                         if (*flags & CPU_DTRACE_FAULT)
15084                                 goto err;
15085                 }
15086
15087                 for (i = 0; i < helper->dtha_nactions; i++) {
15088                         if (trace)
15089                                 dtrace_helper_trace(helper,
15090                                     mstate, vstate, i + 1);
15091
15092                         rval = dtrace_dif_emulate(helper->dtha_actions[i],
15093                             mstate, vstate, state);
15094
15095                         if (*flags & CPU_DTRACE_FAULT)
15096                                 goto err;
15097                 }
15098
15099 next:
15100                 if (trace)
15101                         dtrace_helper_trace(helper, mstate, vstate,
15102                             DTRACE_HELPTRACE_NEXT);
15103         }
15104
15105         if (trace)
15106                 dtrace_helper_trace(helper, mstate, vstate,
15107                     DTRACE_HELPTRACE_DONE);
15108
15109         /*
15110          * Restore the arg0 that we saved upon entry.
15111          */
15112         mstate->dtms_arg[0] = sarg0;
15113         mstate->dtms_arg[1] = sarg1;
15114
15115         return (rval);
15116
15117 err:
15118         if (trace)
15119                 dtrace_helper_trace(helper, mstate, vstate,
15120                     DTRACE_HELPTRACE_ERR);
15121
15122         /*
15123          * Restore the arg0 that we saved upon entry.
15124          */
15125         mstate->dtms_arg[0] = sarg0;
15126         mstate->dtms_arg[1] = sarg1;
15127
15128         return (0);
15129 }
15130
15131 static void
15132 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15133     dtrace_vstate_t *vstate)
15134 {
15135         int i;
15136
15137         if (helper->dtha_predicate != NULL)
15138                 dtrace_difo_release(helper->dtha_predicate, vstate);
15139
15140         for (i = 0; i < helper->dtha_nactions; i++) {
15141                 ASSERT(helper->dtha_actions[i] != NULL);
15142                 dtrace_difo_release(helper->dtha_actions[i], vstate);
15143         }
15144
15145         kmem_free(helper->dtha_actions,
15146             helper->dtha_nactions * sizeof (dtrace_difo_t *));
15147         kmem_free(helper, sizeof (dtrace_helper_action_t));
15148 }
15149
15150 static int
15151 dtrace_helper_destroygen(proc_t* p, int gen)
15152 {
15153         dtrace_helpers_t *help = p->p_dtrace_helpers;
15154         dtrace_vstate_t *vstate;
15155         uint_t i;
15156
15157         LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15158         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15159
15160         if (help == NULL || gen > help->dthps_generation)
15161                 return (EINVAL);
15162
15163         vstate = &help->dthps_vstate;
15164
15165         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15166                 dtrace_helper_action_t *last = NULL, *h, *next;
15167
15168                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15169                         next = h->dtha_next;
15170
15171                         if (h->dtha_generation == gen) {
15172                                 if (last != NULL) {
15173                                         last->dtha_next = next;
15174                                 } else {
15175                                         help->dthps_actions[i] = next;
15176                                 }
15177
15178                                 dtrace_helper_action_destroy(h, vstate);
15179                         } else {
15180                                 last = h;
15181                         }
15182                 }
15183         }
15184
15185         /*
15186          * Interate until we've cleared out all helper providers with the
15187          * given generation number.
15188          */
15189         for (;;) {
15190                 dtrace_helper_provider_t *prov = NULL;
15191
15192                 /*
15193                  * Look for a helper provider with the right generation. We
15194                  * have to start back at the beginning of the list each time
15195                  * because we drop dtrace_lock. It's unlikely that we'll make
15196                  * more than two passes.
15197                  */
15198                 for (i = 0; i < help->dthps_nprovs; i++) {
15199                         prov = help->dthps_provs[i];
15200
15201                         if (prov->dthp_generation == gen)
15202                                 break;
15203                 }
15204
15205                 /*
15206                  * If there were no matches, we're done.
15207                  */
15208                 if (i == help->dthps_nprovs)
15209                         break;
15210
15211                 /*
15212                  * Move the last helper provider into this slot.
15213                  */
15214                 help->dthps_nprovs--;
15215                 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15216                 help->dthps_provs[help->dthps_nprovs] = NULL;
15217
15218                 lck_mtx_unlock(&dtrace_lock);
15219
15220                 /*
15221                  * If we have a meta provider, remove this helper provider.
15222                  */
15223                 if (dtrace_meta_pid != NULL) {
15224                         ASSERT(dtrace_deferred_pid == NULL);
15225                         dtrace_helper_provider_remove(&prov->dthp_prov,
15226                             p);
15227                 }
15228
15229                 dtrace_helper_provider_destroy(prov);
15230
15231                 lck_mtx_lock(&dtrace_lock);
15232         }
15233
15234         return (0);
15235 }
15236
15237 static int
15238 dtrace_helper_validate(dtrace_helper_action_t *helper)
15239 {
15240         int err = 0, i;
15241         dtrace_difo_t *dp;
15242
15243         if ((dp = helper->dtha_predicate) != NULL)
15244                 err += dtrace_difo_validate_helper(dp);
15245
15246         for (i = 0; i < helper->dtha_nactions; i++)
15247                 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15248
15249         return (err == 0);
15250 }
15251
15252 static int
15253 dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
15254 {
15255         dtrace_helpers_t *help;
15256         dtrace_helper_action_t *helper, *last;
15257         dtrace_actdesc_t *act;
15258         dtrace_vstate_t *vstate;
15259         dtrace_predicate_t *pred;
15260         int count = 0, nactions = 0, i;
15261
15262         if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15263                 return (EINVAL);
15264
15265         help = p->p_dtrace_helpers;
15266         last = help->dthps_actions[which];
15267         vstate = &help->dthps_vstate;
15268
15269         for (count = 0; last != NULL; last = last->dtha_next) {
15270                 count++;
15271                 if (last->dtha_next == NULL)
15272                         break;
15273         }
15274
15275         /*
15276          * If we already have dtrace_helper_actions_max helper actions for this
15277          * helper action type, we'll refuse to add a new one.
15278          */
15279         if (count >= dtrace_helper_actions_max)
15280                 return (ENOSPC);
15281
15282         helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15283         helper->dtha_generation = help->dthps_generation;
15284
15285         if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15286                 ASSERT(pred->dtp_difo != NULL);
15287                 dtrace_difo_hold(pred->dtp_difo);
15288                 helper->dtha_predicate = pred->dtp_difo;
15289         }
15290
15291         for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15292                 if (act->dtad_kind != DTRACEACT_DIFEXPR)
15293                         goto err;
15294
15295                 if (act->dtad_difo == NULL)
15296                         goto err;
15297
15298                 nactions++;
15299         }
15300
15301         helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15302             (helper->dtha_nactions = nactions), KM_SLEEP);
15303
15304         for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15305                 dtrace_difo_hold(act->dtad_difo);
15306                 helper->dtha_actions[i++] = act->dtad_difo;
15307         }
15308
15309         if (!dtrace_helper_validate(helper))
15310                 goto err;
15311
15312         if (last == NULL) {
15313                 help->dthps_actions[which] = helper;
15314         } else {
15315                 last->dtha_next = helper;
15316         }
15317
15318         if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15319                 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15320                 dtrace_helptrace_next = 0;
15321         }
15322
15323         return (0);
15324 err:
15325         dtrace_helper_action_destroy(helper, vstate);
15326         return (EINVAL);
15327 }
15328
15329 static void
15330 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15331     dof_helper_t *dofhp)
15332 {
15333         LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15334         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15335
15336         lck_mtx_lock(&dtrace_lock);
15337
15338         if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15339                 /*
15340                  * If the dtrace module is loaded but not attached, or if
15341                  * there aren't isn't a meta provider registered to deal with
15342                  * these provider descriptions, we need to postpone creating
15343                  * the actual providers until later.
15344                  */
15345
15346                 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15347                     dtrace_deferred_pid != help) {
15348                         help->dthps_deferred = 1;
15349                         help->dthps_pid = p->p_pid;
15350                         help->dthps_next = dtrace_deferred_pid;
15351                         help->dthps_prev = NULL;
15352                         if (dtrace_deferred_pid != NULL)
15353                                 dtrace_deferred_pid->dthps_prev = help;
15354                         dtrace_deferred_pid = help;
15355                 }
15356
15357                 lck_mtx_unlock(&dtrace_lock);
15358
15359         } else if (dofhp != NULL) {
15360                 /*
15361                  * If the dtrace module is loaded and we have a particular
15362                  * helper provider description, pass that off to the
15363                  * meta provider.
15364                  */
15365
15366                 lck_mtx_unlock(&dtrace_lock);
15367
15368                 dtrace_helper_provide(dofhp, p);
15369
15370         } else {
15371                 /*
15372                  * Otherwise, just pass all the helper provider descriptions
15373                  * off to the meta provider.
15374                  */
15375
15376                 uint_t i;
15377                 lck_mtx_unlock(&dtrace_lock);
15378
15379                 for (i = 0; i < help->dthps_nprovs; i++) {
15380                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15381                                 p);
15382                 }
15383         }
15384 }
15385
15386 static int
15387 dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
15388 {
15389         dtrace_helpers_t *help;
15390         dtrace_helper_provider_t *hprov, **tmp_provs;
15391         uint_t tmp_maxprovs, i;
15392
15393         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15394         help = p->p_dtrace_helpers;
15395         ASSERT(help != NULL);
15396
15397         /*
15398          * If we already have dtrace_helper_providers_max helper providers,
15399          * we're refuse to add a new one.
15400          */
15401         if (help->dthps_nprovs >= dtrace_helper_providers_max)
15402                 return (ENOSPC);
15403
15404         /*
15405          * Check to make sure this isn't a duplicate.
15406          */
15407         for (i = 0; i < help->dthps_nprovs; i++) {
15408                 if (dofhp->dofhp_addr ==
15409                     help->dthps_provs[i]->dthp_prov.dofhp_addr)
15410                         return (EALREADY);
15411         }
15412
15413         hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15414         hprov->dthp_prov = *dofhp;
15415         hprov->dthp_ref = 1;
15416         hprov->dthp_generation = gen;
15417
15418         /*
15419          * Allocate a bigger table for helper providers if it's already full.
15420          */
15421         if (help->dthps_maxprovs == help->dthps_nprovs) {
15422                 tmp_maxprovs = help->dthps_maxprovs;
15423                 tmp_provs = help->dthps_provs;
15424
15425                 if (help->dthps_maxprovs == 0)
15426                         help->dthps_maxprovs = 2;
15427                 else
15428                         help->dthps_maxprovs *= 2;
15429                 if (help->dthps_maxprovs > dtrace_helper_providers_max)
15430                         help->dthps_maxprovs = dtrace_helper_providers_max;
15431
15432                 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15433
15434                 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15435                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15436
15437                 if (tmp_provs != NULL) {
15438                         bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15439                             sizeof (dtrace_helper_provider_t *));
15440                         kmem_free(tmp_provs, tmp_maxprovs *
15441                             sizeof (dtrace_helper_provider_t *));
15442                 }
15443         }
15444
15445         help->dthps_provs[help->dthps_nprovs] = hprov;
15446         help->dthps_nprovs++;
15447
15448         return (0);
15449 }
15450
15451 static void
15452 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15453 {
15454         lck_mtx_lock(&dtrace_lock);
15455
15456         if (--hprov->dthp_ref == 0) {
15457                 dof_hdr_t *dof;
15458                 lck_mtx_unlock(&dtrace_lock);
15459                 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15460                 dtrace_dof_destroy(dof);
15461                 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15462         } else {
15463                 lck_mtx_unlock(&dtrace_lock);
15464         }
15465 }
15466
15467 static int
15468 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15469 {
15470         uintptr_t daddr = (uintptr_t)dof;
15471         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15472         dof_provider_t *provider;
15473         dof_probe_t *probe;
15474         uint8_t *arg;
15475         char *strtab, *typestr;
15476         dof_stridx_t typeidx;
15477         size_t typesz;
15478         uint_t nprobes, j, k;
15479
15480         ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15481
15482         if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15483                 dtrace_dof_error(dof, "misaligned section offset");
15484                 return (-1);
15485         }
15486
15487         /*
15488          * The section needs to be large enough to contain the DOF provider
15489          * structure appropriate for the given version.
15490          */
15491         if (sec->dofs_size <
15492             ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15493             offsetof(dof_provider_t, dofpv_prenoffs) :
15494             sizeof (dof_provider_t))) {
15495                 dtrace_dof_error(dof, "provider section too small");
15496                 return (-1);
15497         }
15498
15499         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15500         str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15501         prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15502         arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15503         off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15504
15505         if (str_sec == NULL || prb_sec == NULL ||
15506             arg_sec == NULL || off_sec == NULL)
15507                 return (-1);
15508
15509         enoff_sec = NULL;
15510
15511         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15512             provider->dofpv_prenoffs != DOF_SECT_NONE &&
15513             (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15514             provider->dofpv_prenoffs)) == NULL)
15515                 return (-1);
15516
15517         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15518
15519         if (provider->dofpv_name >= str_sec->dofs_size ||
15520             strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15521                 dtrace_dof_error(dof, "invalid provider name");
15522                 return (-1);
15523         }
15524
15525         if (prb_sec->dofs_entsize == 0 ||
15526             prb_sec->dofs_entsize > prb_sec->dofs_size) {
15527                 dtrace_dof_error(dof, "invalid entry size");
15528                 return (-1);
15529         }
15530
15531         if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15532                 dtrace_dof_error(dof, "misaligned entry size");
15533                 return (-1);
15534         }
15535
15536         if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15537                 dtrace_dof_error(dof, "invalid entry size");
15538                 return (-1);
15539         }
15540
15541         if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15542                 dtrace_dof_error(dof, "misaligned section offset");
15543                 return (-1);
15544         }
15545
15546         if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15547                 dtrace_dof_error(dof, "invalid entry size");
15548                 return (-1);
15549         }
15550
15551         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15552
15553         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15554
15555         /*
15556          * Take a pass through the probes to check for errors.
15557          */
15558         for (j = 0; j < nprobes; j++) {
15559                 probe = (dof_probe_t *)(uintptr_t)(daddr +
15560                     prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15561
15562                 if (probe->dofpr_func >= str_sec->dofs_size) {
15563                         dtrace_dof_error(dof, "invalid function name");
15564                         return (-1);
15565                 }
15566
15567                 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15568                         dtrace_dof_error(dof, "function name too long");
15569                         return (-1);
15570                 }
15571
15572                 if (probe->dofpr_name >= str_sec->dofs_size ||
15573                     strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15574                         dtrace_dof_error(dof, "invalid probe name");
15575                         return (-1);
15576                 }
15577
15578                 /*
15579                  * The offset count must not wrap the index, and the offsets
15580                  * must also not overflow the section's data.
15581                  */
15582                 if (probe->dofpr_offidx + probe->dofpr_noffs <
15583                     probe->dofpr_offidx ||
15584                     (probe->dofpr_offidx + probe->dofpr_noffs) *
15585                     off_sec->dofs_entsize > off_sec->dofs_size) {
15586                         dtrace_dof_error(dof, "invalid probe offset");
15587                         return (-1);
15588                 }
15589
15590                 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15591                         /*
15592                          * If there's no is-enabled offset section, make sure
15593                          * there aren't any is-enabled offsets. Otherwise
15594                          * perform the same checks as for probe offsets
15595                          * (immediately above).
15596                          */
15597                         if (enoff_sec == NULL) {
15598                                 if (probe->dofpr_enoffidx != 0 ||
15599                                     probe->dofpr_nenoffs != 0) {
15600                                         dtrace_dof_error(dof, "is-enabled "
15601                                             "offsets with null section");
15602                                         return (-1);
15603                                 }
15604                         } else if (probe->dofpr_enoffidx +
15605                             probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15606                             (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15607                             enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15608                                 dtrace_dof_error(dof, "invalid is-enabled "
15609                                     "offset");
15610                                 return (-1);
15611                         }
15612
15613                         if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15614                                 dtrace_dof_error(dof, "zero probe and "
15615                                     "is-enabled offsets");
15616                                 return (-1);
15617                         }
15618                 } else if (probe->dofpr_noffs == 0) {
15619                         dtrace_dof_error(dof, "zero probe offsets");
15620                         return (-1);
15621                 }
15622
15623                 if (probe->dofpr_argidx + probe->dofpr_xargc <
15624                     probe->dofpr_argidx ||
15625                     (probe->dofpr_argidx + probe->dofpr_xargc) *
15626                     arg_sec->dofs_entsize > arg_sec->dofs_size) {
15627                         dtrace_dof_error(dof, "invalid args");
15628                         return (-1);
15629                 }
15630
15631                 typeidx = probe->dofpr_nargv;
15632                 typestr = strtab + probe->dofpr_nargv;
15633                 for (k = 0; k < probe->dofpr_nargc; k++) {
15634                         if (typeidx >= str_sec->dofs_size) {
15635                                 dtrace_dof_error(dof, "bad "
15636                                     "native argument type");
15637                                 return (-1);
15638                         }
15639
15640                         typesz = strlen(typestr) + 1;
15641                         if (typesz > DTRACE_ARGTYPELEN) {
15642                                 dtrace_dof_error(dof, "native "
15643                                     "argument type too long");
15644                                 return (-1);
15645                         }
15646                         typeidx += typesz;
15647                         typestr += typesz;
15648                 }
15649
15650                 typeidx = probe->dofpr_xargv;
15651                 typestr = strtab + probe->dofpr_xargv;
15652                 for (k = 0; k < probe->dofpr_xargc; k++) {
15653                         if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15654                                 dtrace_dof_error(dof, "bad "
15655                                     "native argument index");
15656                                 return (-1);
15657                         }
15658
15659                         if (typeidx >= str_sec->dofs_size) {
15660                                 dtrace_dof_error(dof, "bad "
15661                                     "translated argument type");
15662                                 return (-1);
15663                         }
15664
15665                         typesz = strlen(typestr) + 1;
15666                         if (typesz > DTRACE_ARGTYPELEN) {
15667                                 dtrace_dof_error(dof, "translated argument "
15668                                     "type too long");
15669                                 return (-1);
15670                         }
15671
15672                         typeidx += typesz;
15673                         typestr += typesz;
15674                 }
15675         }
15676
15677         return (0);
15678 }
15679
15680 static int
15681 dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
15682 {
15683         dtrace_helpers_t *help;
15684         dtrace_vstate_t *vstate;
15685         dtrace_enabling_t *enab = NULL;
15686         int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15687         uintptr_t daddr = (uintptr_t)dof;
15688
15689         LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15690         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15691
15692         if ((help = p->p_dtrace_helpers) == NULL)
15693                 help = dtrace_helpers_create(p);
15694
15695         vstate = &help->dthps_vstate;
15696
15697         if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15698             dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15699                 dtrace_dof_destroy(dof);
15700                 return (rv);
15701         }
15702
15703         /*
15704          * Look for helper providers and validate their descriptions.
15705          */
15706         if (dhp != NULL) {
15707                 for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
15708                         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15709                             dof->dofh_secoff + i * dof->dofh_secsize);
15710
15711                         if (sec->dofs_type != DOF_SECT_PROVIDER)
15712                                 continue;
15713
15714                         if (dtrace_helper_provider_validate(dof, sec) != 0) {
15715                                 dtrace_enabling_destroy(enab);
15716                                 dtrace_dof_destroy(dof);
15717                                 return (-1);
15718                         }
15719
15720                         nprovs++;
15721                 }
15722         }
15723
15724         /*
15725          * Now we need to walk through the ECB descriptions in the enabling.
15726          */
15727         for (i = 0; i < enab->dten_ndesc; i++) {
15728                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15729                 dtrace_probedesc_t *desc = &ep->dted_probe;
15730
15731                 /* APPLE NOTE: Darwin employs size bounded string operation. */
15732                 if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
15733                         continue;
15734
15735                 if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
15736                         continue;
15737
15738                 if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
15739                         continue;
15740
15741                 if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
15742                     ep)) != 0) {
15743                         /*
15744                          * Adding this helper action failed -- we are now going
15745                          * to rip out the entire generation and return failure.
15746                          */
15747                         (void) dtrace_helper_destroygen(p, help->dthps_generation);
15748                         dtrace_enabling_destroy(enab);
15749                         dtrace_dof_destroy(dof);
15750                         return (-1);
15751                 }
15752
15753                 nhelpers++;
15754         }
15755
15756         if (nhelpers < enab->dten_ndesc)
15757                 dtrace_dof_error(dof, "unmatched helpers");
15758
15759         gen = help->dthps_generation++;
15760         dtrace_enabling_destroy(enab);
15761
15762         if (dhp != NULL && nprovs > 0) {
15763                 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15764                 if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
15765                         lck_mtx_unlock(&dtrace_lock);
15766                         dtrace_helper_provider_register(p, help, dhp);
15767                         lck_mtx_lock(&dtrace_lock);
15768
15769                         destroy = 0;
15770                 }
15771         }
15772
15773         if (destroy)
15774                 dtrace_dof_destroy(dof);
15775
15776         return (gen);
15777 }
15778
15779 /*
15780  * APPLE NOTE:  DTrace lazy dof implementation
15781  *
15782  * DTrace user static probes (USDT probes) and helper actions are loaded
15783  * in a process by processing dof sections. The dof sections are passed
15784  * into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
15785  * expensive to process dof for a process that will never use it. There
15786  * is a memory cost (allocating the providers/probes), and a cpu cost
15787  * (creating the providers/probes).
15788  *
15789  * To reduce this cost, we use "lazy dof". The normal procedure for
15790  * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
15791  * block, and invoke dtrace_helper_slurp() on them. When "lazy dof" is
15792  * used, each process retains the dof_ioctl_data_t block, instead of
15793  * copying in the data it points to.
15794  *
15795  * The dof_ioctl_data_t blocks are managed as if they were the actual
15796  * processed dof; on fork the block is copied to the child, on exec and
15797  * exit the block is freed.
15798  *
15799  * If the process loads library(s) containing additional dof, the
15800  * new dof_ioctl_data_t is merged with the existing block.
15801  *
15802  * There are a few catches that make this slightly more difficult.
15803  * When dyld registers dof_ioctl_data_t blocks, it expects a unique
15804  * identifier value for each dof in the block. In non-lazy dof terms,
15805  * this is the generation that dof was loaded in. If we hand back
15806  * a UID for a lazy dof, that same UID must be able to unload the
15807  * dof once it has become non-lazy. To meet this requirement, the
15808  * code that loads lazy dof requires that the UID's for dof(s) in
15809  * the lazy dof be sorted, and in ascending order. It is okay to skip
15810  * UID's, I.E., 1 -> 5 -> 6 is legal.
15811  *
15812  * Once a process has become non-lazy, it will stay non-lazy. All
15813  * future dof operations for that process will be non-lazy, even
15814  * if the dof mode transitions back to lazy.
15815  *
15816  * Always do lazy dof checks before non-lazy (I.E. In fork, exit, exec.).
15817  * That way if the lazy check fails due to transitioning to non-lazy, the
15818  * right thing is done with the newly faulted in dof.
15819  */
15820
15821 /*
15822  * This method is a bit squicky. It must handle:
15823  *
15824  * dof should not be lazy.
15825  * dof should have been handled lazily, but there was an error
15826  * dof was handled lazily, and needs to be freed.
15827  * dof was handled lazily, and must not be freed.
15828  *
15829  *
15830  * Returns EACCES if dof should be handled non-lazily.
15831  *
15832  * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
15833  *
15834  * If the dofs data is claimed by this method, dofs_claimed will be set.
15835  * Callers should not free claimed dofs.
15836  */
15837 static int
15838 dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
15839 {
15840         ASSERT(p);
15841         ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
15842
15843         int rval = 0;
15844         *dofs_claimed = 0;
15845
15846         lck_rw_lock_shared(&dtrace_dof_mode_lock);
15847
15848         ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15849         ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15850
15851         /*
15852          * Any existing helpers force non-lazy behavior.
15853          */
15854         if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15855                 dtrace_sprlock(p);
15856
15857                 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15858                 unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
15859                 unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
15860
15861                 /*
15862                  * Range check...
15863                  */
15864                 if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
15865                         dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
15866                         rval = EINVAL;
15867                         goto unlock;
15868                 }
15869
15870                 /*
15871                  * Each dof being added must be assigned a unique generation.
15872                  */
15873                 uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
15874                 for (i=0; i<incoming_dofs->dofiod_count; i++) {
15875                         /*
15876                          * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
15877                          */
15878                         ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
15879                         incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
15880                 }
15881
15882
15883                 if (existing_dofs) {
15884                         /*
15885                          * Merge the existing and incoming dofs
15886                          */
15887                         size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
15888                         dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
15889
15890                         bcopy(&existing_dofs->dofiod_helpers[0],
15891                               &merged_dofs->dofiod_helpers[0],
15892                               sizeof(dof_helper_t) * existing_dofs_count);
15893                         bcopy(&incoming_dofs->dofiod_helpers[0],
15894                               &merged_dofs->dofiod_helpers[existing_dofs_count],
15895                               sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
15896
15897                         merged_dofs->dofiod_count = merged_dofs_count;
15898
15899                         kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15900
15901                         p->p_dtrace_lazy_dofs = merged_dofs;
15902                 } else {
15903                         /*
15904                          * Claim the incoming dofs
15905                          */
15906                         *dofs_claimed = 1;
15907                         p->p_dtrace_lazy_dofs = incoming_dofs;
15908                 }
15909
15910 #if DEBUG
15911                 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15912                 for (i=0; i<all_dofs->dofiod_count-1; i++) {
15913                         ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15914                 }
15915 #endif /* DEBUG */
15916
15917 unlock:
15918                 dtrace_sprunlock(p);
15919         } else {
15920                 rval = EACCES;
15921         }
15922
15923         lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15924
15925         return rval;
15926 }
15927
15928 /*
15929  * Returns:
15930  *
15931  * EINVAL: lazy dof is enabled, but the requested generation was not found.
15932  * EACCES: This removal needs to be handled non-lazily.
15933  */
15934 static int
15935 dtrace_lazy_dofs_remove(proc_t *p, int generation)
15936 {
15937         int rval = EINVAL;
15938
15939         lck_rw_lock_shared(&dtrace_dof_mode_lock);
15940
15941         ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15942         ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15943
15944         /*
15945          * Any existing helpers force non-lazy behavior.
15946          */
15947         if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15948                 dtrace_sprlock(p);
15949
15950                 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15951
15952                 if (existing_dofs) {
15953                         int index, existing_dofs_count = existing_dofs->dofiod_count;
15954                         for (index=0; index<existing_dofs_count; index++) {
15955                                 if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
15956                                         dof_ioctl_data_t* removed_dofs = NULL;
15957
15958                                         /*
15959                                          * If there is only 1 dof, we'll delete it and swap in NULL.
15960                                          */
15961                                         if (existing_dofs_count > 1) {
15962                                                 int removed_dofs_count = existing_dofs_count - 1;
15963                                                 size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
15964
15965                                                 removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
15966                                                 removed_dofs->dofiod_count = removed_dofs_count;
15967
15968                                                 /*
15969                                                  * copy the remaining data.
15970                                                  */
15971                                                 if (index > 0) {
15972                                                         bcopy(&existing_dofs->dofiod_helpers[0],
15973                                                               &removed_dofs->dofiod_helpers[0],
15974                                                               index * sizeof(dof_helper_t));
15975                                                 }
15976
15977                                                 if (index < existing_dofs_count-1) {
15978                                                         bcopy(&existing_dofs->dofiod_helpers[index+1],
15979                                                               &removed_dofs->dofiod_helpers[index],
15980                                                               (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
15981                                                 }
15982                                         }
15983
15984                                         kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15985
15986                                         p->p_dtrace_lazy_dofs = removed_dofs;
15987
15988                                         rval = KERN_SUCCESS;
15989
15990                                         break;
15991                                 }
15992                         }
15993
15994 #if DEBUG
15995                         dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15996                         if (all_dofs) {
15997                                 unsigned int i;
15998                                 for (i=0; i<all_dofs->dofiod_count-1; i++) {
15999                                         ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
16000                                 }
16001                         }
16002 #endif
16003
16004                 }
16005                 dtrace_sprunlock(p);
16006         } else {
16007                 rval = EACCES;
16008         }
16009
16010         lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16011
16012         return rval;
16013 }
16014
16015 void
16016 dtrace_lazy_dofs_destroy(proc_t *p)
16017 {
16018         lck_rw_lock_shared(&dtrace_dof_mode_lock);
16019         dtrace_sprlock(p);
16020
16021         ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16022
16023         dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16024         p->p_dtrace_lazy_dofs = NULL;
16025
16026         dtrace_sprunlock(p);
16027         lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16028
16029         if (lazy_dofs) {
16030                 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16031         }
16032 }
16033
16034 static int
16035 dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
16036 {
16037 #pragma unused(ignored)
16038         /*
16039          * Okay to NULL test without taking the sprlock.
16040          */
16041         return p->p_dtrace_lazy_dofs != NULL;
16042 }
16043
16044 static void
16045 dtrace_lazy_dofs_process(proc_t *p) {
16046         /*
16047          * It is possible this process may exit during our attempt to
16048          * fault in the dof. We could fix this by holding locks longer,
16049          * but the errors are benign.
16050          */
16051         dtrace_sprlock(p);
16052
16053
16054         ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16055         ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
16056
16057         dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16058         p->p_dtrace_lazy_dofs = NULL;
16059
16060         dtrace_sprunlock(p);
16061         lck_mtx_lock(&dtrace_meta_lock);
16062         /*
16063          * Process each dof_helper_t
16064          */
16065         if (lazy_dofs != NULL) {
16066                 unsigned int i;
16067                 int rval;
16068
16069                 for (i=0; i<lazy_dofs->dofiod_count; i++) {
16070                         /*
16071                          * When loading lazy dof, we depend on the generations being sorted in ascending order.
16072                          */
16073                         ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
16074
16075                         dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
16076
16077                         /*
16078                          * We stored the generation in dofhp_dof. Save it, and restore the original value.
16079                          */
16080                         int generation = dhp->dofhp_dof;
16081                         dhp->dofhp_dof = dhp->dofhp_addr;
16082
16083                         dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
16084
16085                         if (dof != NULL) {
16086                                 dtrace_helpers_t *help;
16087
16088                                 lck_mtx_lock(&dtrace_lock);
16089
16090                                 /*
16091                                  * This must be done with the dtrace_lock held
16092                                  */
16093                                 if ((help = p->p_dtrace_helpers) == NULL)
16094                                         help = dtrace_helpers_create(p);
16095
16096                                 /*
16097                                  * If the generation value has been bumped, someone snuck in
16098                                  * when we released the dtrace lock. We have to dump this generation,
16099                                  * there is no safe way to load it.
16100                                  */
16101                                 if (help->dthps_generation <= generation) {
16102                                         help->dthps_generation = generation;
16103
16104                                         /*
16105                                          * dtrace_helper_slurp() takes responsibility for the dof --
16106                                          * it may free it now or it may save it and free it later.
16107                                          */
16108                                         if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
16109                                                 dtrace_dof_error(NULL, "returned value did not match expected generation");
16110                                         }
16111                                 }
16112
16113                                 lck_mtx_unlock(&dtrace_lock);
16114                         }
16115                 }
16116                 lck_mtx_unlock(&dtrace_meta_lock);
16117                 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16118         } else {
16119                 lck_mtx_unlock(&dtrace_meta_lock);
16120         }
16121 }
16122
16123 static int
16124 dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
16125 {
16126 #pragma unused(ignored)
16127
16128         dtrace_lazy_dofs_process(p);
16129
16130         return PROC_RETURNED;
16131 }
16132
16133 #define DTRACE_LAZY_DOFS_DUPLICATED 1
16134
16135 static int
16136 dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
16137 {
16138         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
16139         LCK_MTX_ASSERT(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16140         LCK_MTX_ASSERT(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16141
16142         lck_rw_lock_shared(&dtrace_dof_mode_lock);
16143         dtrace_sprlock(parent);
16144
16145         /*
16146          * We need to make sure that the transition to lazy dofs -> helpers
16147          * was atomic for our parent
16148          */
16149         ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
16150         /*
16151          * In theory we should hold the child sprlock, but this is safe...
16152          */
16153         ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
16154
16155         dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
16156         dof_ioctl_data_t* child_dofs = NULL;
16157         if (parent_dofs) {
16158                 size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
16159                 child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
16160                 bcopy(parent_dofs, child_dofs, parent_dofs_size);
16161         }
16162
16163         dtrace_sprunlock(parent);
16164
16165         if (child_dofs) {
16166                 dtrace_sprlock(child);
16167                 child->p_dtrace_lazy_dofs = child_dofs;
16168                 dtrace_sprunlock(child);
16169                 /**
16170                  * We process the DOF at this point if the mode is set to
16171                  * LAZY_OFF. This can happen if DTrace is still processing the
16172                  * DOF of other process (which can happen because the
16173                  * protected pager can have a huge latency)
16174                  * but has not processed our parent yet
16175                  */
16176                 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
16177                         dtrace_lazy_dofs_process(child);
16178                 }
16179                 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16180
16181                 return DTRACE_LAZY_DOFS_DUPLICATED;
16182         }
16183         lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16184
16185         return 0;
16186 }
16187
16188 static dtrace_helpers_t *
16189 dtrace_helpers_create(proc_t *p)
16190 {
16191         dtrace_helpers_t *help;
16192
16193         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
16194         ASSERT(p->p_dtrace_helpers == NULL);
16195
16196         help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16197         help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16198             DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16199
16200         p->p_dtrace_helpers = help;
16201         dtrace_helpers++;
16202
16203         return (help);
16204 }
16205
16206 static void
16207 dtrace_helpers_destroy(proc_t* p)
16208 {
16209         dtrace_helpers_t *help;
16210         dtrace_vstate_t *vstate;
16211         uint_t i;
16212
16213         lck_mtx_lock(&dtrace_meta_lock);
16214         lck_mtx_lock(&dtrace_lock);
16215
16216         ASSERT(p->p_dtrace_helpers != NULL);
16217         ASSERT(dtrace_helpers > 0);
16218
16219         help = p->p_dtrace_helpers;
16220         vstate = &help->dthps_vstate;
16221
16222         /*
16223          * We're now going to lose the help from this process.
16224          */
16225         p->p_dtrace_helpers = NULL;
16226         dtrace_sync();
16227
16228         /*
16229          * Destory the helper actions.
16230          */
16231         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16232                 dtrace_helper_action_t *h, *next;
16233
16234                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
16235                         next = h->dtha_next;
16236                         dtrace_helper_action_destroy(h, vstate);
16237                         h = next;
16238                 }
16239         }
16240
16241         lck_mtx_unlock(&dtrace_lock);
16242
16243         /*
16244          * Destroy the helper providers.
16245          */
16246         if (help->dthps_maxprovs > 0) {
16247                 if (dtrace_meta_pid != NULL) {
16248                         ASSERT(dtrace_deferred_pid == NULL);
16249
16250                         for (i = 0; i < help->dthps_nprovs; i++) {
16251                                 dtrace_helper_provider_remove(
16252                                     &help->dthps_provs[i]->dthp_prov, p);
16253                         }
16254                 } else {
16255                         lck_mtx_lock(&dtrace_lock);
16256                         ASSERT(help->dthps_deferred == 0 ||
16257                             help->dthps_next != NULL ||
16258                             help->dthps_prev != NULL ||
16259                             help == dtrace_deferred_pid);
16260
16261                         /*
16262                          * Remove the helper from the deferred list.
16263                          */
16264                         if (help->dthps_next != NULL)
16265                                 help->dthps_next->dthps_prev = help->dthps_prev;
16266                         if (help->dthps_prev != NULL)
16267                                 help->dthps_prev->dthps_next = help->dthps_next;
16268                         if (dtrace_deferred_pid == help) {
16269                                 dtrace_deferred_pid = help->dthps_next;
16270                                 ASSERT(help->dthps_prev == NULL);
16271                         }
16272
16273                         lck_mtx_unlock(&dtrace_lock);
16274                 }
16275
16276
16277                 for (i = 0; i < help->dthps_nprovs; i++) {
16278                         dtrace_helper_provider_destroy(help->dthps_provs[i]);
16279                 }
16280
16281                 kmem_free(help->dthps_provs, help->dthps_maxprovs *
16282                     sizeof (dtrace_helper_provider_t *));
16283         }
16284
16285         lck_mtx_lock(&dtrace_lock);
16286
16287         dtrace_vstate_fini(&help->dthps_vstate);
16288         kmem_free(help->dthps_actions,
16289             sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16290         kmem_free(help, sizeof (dtrace_helpers_t));
16291
16292         --dtrace_helpers;
16293         lck_mtx_unlock(&dtrace_lock);
16294         lck_mtx_unlock(&dtrace_meta_lock);
16295 }
16296
16297 static void
16298 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16299 {
16300         dtrace_helpers_t *help, *newhelp;
16301         dtrace_helper_action_t *helper, *new, *last;
16302         dtrace_difo_t *dp;
16303         dtrace_vstate_t *vstate;
16304         uint_t i;
16305         int j, sz, hasprovs = 0;
16306
16307         lck_mtx_lock(&dtrace_meta_lock);
16308         lck_mtx_lock(&dtrace_lock);
16309         ASSERT(from->p_dtrace_helpers != NULL);
16310         ASSERT(dtrace_helpers > 0);
16311
16312         help = from->p_dtrace_helpers;
16313         newhelp = dtrace_helpers_create(to);
16314         ASSERT(to->p_dtrace_helpers != NULL);
16315
16316         newhelp->dthps_generation = help->dthps_generation;
16317         vstate = &newhelp->dthps_vstate;
16318
16319         /*
16320          * Duplicate the helper actions.
16321          */
16322         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16323                 if ((helper = help->dthps_actions[i]) == NULL)
16324                         continue;
16325
16326                 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16327                         new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16328                             KM_SLEEP);
16329                         new->dtha_generation = helper->dtha_generation;
16330
16331                         if ((dp = helper->dtha_predicate) != NULL) {
16332                                 dp = dtrace_difo_duplicate(dp, vstate);
16333                                 new->dtha_predicate = dp;
16334                         }
16335
16336                         new->dtha_nactions = helper->dtha_nactions;
16337                         sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16338                         new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16339
16340                         for (j = 0; j < new->dtha_nactions; j++) {
16341                                 dtrace_difo_t *dpj = helper->dtha_actions[j];
16342
16343                                 ASSERT(dpj != NULL);
16344                                 dpj = dtrace_difo_duplicate(dpj, vstate);
16345                                 new->dtha_actions[j] = dpj;
16346                         }
16347
16348                         if (last != NULL) {
16349                                 last->dtha_next = new;
16350                         } else {
16351                                 newhelp->dthps_actions[i] = new;
16352                         }
16353
16354                         last = new;
16355                 }
16356         }
16357
16358         /*
16359          * Duplicate the helper providers and register them with the
16360          * DTrace framework.
16361          */
16362         if (help->dthps_nprovs > 0) {
16363                 newhelp->dthps_nprovs = help->dthps_nprovs;
16364                 newhelp->dthps_maxprovs = help->dthps_nprovs;
16365                 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16366                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16367                 for (i = 0; i < newhelp->dthps_nprovs; i++) {
16368                         newhelp->dthps_provs[i] = help->dthps_provs[i];
16369                         newhelp->dthps_provs[i]->dthp_ref++;
16370                 }
16371
16372                 hasprovs = 1;
16373         }
16374
16375         lck_mtx_unlock(&dtrace_lock);
16376
16377         if (hasprovs)
16378                 dtrace_helper_provider_register(to, newhelp, NULL);
16379
16380         lck_mtx_unlock(&dtrace_meta_lock);
16381 }
16382
16383 /**
16384  * DTrace Process functions
16385  */
16386
16387 void
16388 dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn)
16389 {
16390         /*
16391          * This code applies to new processes who are copying the task
16392          * and thread state and address spaces of their parent process.
16393          */
16394         if (!spawn) {
16395                 /*
16396                  * APPLE NOTE: Solaris does a sprlock() and drops the
16397                  * proc_lock here. We're cheating a bit and only taking
16398                  * the p_dtrace_sprlock lock. A full sprlock would
16399                  * task_suspend the parent.
16400                  */
16401                 dtrace_sprlock(parent_proc);
16402
16403                 /*
16404                  * Remove all DTrace tracepoints from the child process. We
16405                  * need to do this _before_ duplicating USDT providers since
16406                  * any associated probes may be immediately enabled.
16407                  */
16408                 if (parent_proc->p_dtrace_count > 0) {
16409                         dtrace_fasttrap_fork(parent_proc, child_proc);
16410                 }
16411
16412                 dtrace_sprunlock(parent_proc);
16413
16414                 /*
16415                  * Duplicate any lazy dof(s). This must be done while NOT
16416                  * holding the parent sprlock! Lock ordering is
16417                  * dtrace_dof_mode_lock, then sprlock.  It is imperative we
16418                  * always call dtrace_lazy_dofs_duplicate, rather than null
16419                  * check and call if !NULL. If we NULL test, during lazy dof
16420                  * faulting we can race with the faulting code and proceed
16421                  * from here to beyond the helpers copy. The lazy dof
16422                  * faulting will then fail to copy the helpers to the child
16423                  * process. We return if we duplicated lazy dofs as a process
16424                  * can only have one at the same time to avoid a race between
16425                  * a dtrace client and dtrace_proc_fork where a process would
16426                  * end up with both lazy dofs and helpers.
16427                  */
16428                 if (dtrace_lazy_dofs_duplicate(parent_proc, child_proc) == DTRACE_LAZY_DOFS_DUPLICATED) {
16429                         return;
16430                 }
16431
16432                 /*
16433                  * Duplicate any helper actions and providers if they haven't
16434                  * already.
16435                  */
16436 #if !defined(__APPLE__)
16437                  /*
16438                  * The SFORKING
16439                  * we set above informs the code to enable USDT probes that
16440                  * sprlock() may fail because the child is being forked.
16441                  */
16442 #endif
16443                 /*
16444                  * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent
16445                  * never fails to find the child. We do not set SFORKING.
16446                  */
16447                 if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
16448                         (*dtrace_helpers_fork)(parent_proc, child_proc);
16449                 }
16450         }
16451 }
16452
16453 void
16454 dtrace_proc_exec(proc_t *p)
16455 {
16456         /*
16457          * Invalidate any predicate evaluation already cached for this thread by DTrace.
16458          * That's because we've just stored to p_comm and DTrace refers to that when it
16459          * evaluates the "execname" special variable. uid and gid may have changed as well.
16460          */
16461         dtrace_set_thread_predcache(current_thread(), 0);
16462
16463         /*
16464          * Free any outstanding lazy dof entries. It is imperative we
16465          * always call dtrace_lazy_dofs_destroy, rather than null check
16466          * and call if !NULL. If we NULL test, during lazy dof faulting
16467          * we can race with the faulting code and proceed from here to
16468          * beyond the helpers cleanup. The lazy dof faulting will then
16469          * install new helpers which no longer belong to this process!
16470          */
16471         dtrace_lazy_dofs_destroy(p);
16472
16473
16474         /*
16475          * Clean up any DTrace helpers for the process.
16476          */
16477         if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
16478                 (*dtrace_helpers_cleanup)(p);
16479         }
16480
16481         /*
16482          * Cleanup the DTrace provider associated with this process.
16483          */
16484         proc_lock(p);
16485         if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
16486                 (*dtrace_fasttrap_exec_ptr)(p);
16487         }
16488         proc_unlock(p);
16489 }
16490
16491 void
16492 dtrace_proc_exit(proc_t *p)
16493 {
16494         /*
16495          * Free any outstanding lazy dof entries. It is imperative we
16496          * always call dtrace_lazy_dofs_destroy, rather than null check
16497          * and call if !NULL. If we NULL test, during lazy dof faulting
16498          * we can race with the faulting code and proceed from here to
16499          * beyond the helpers cleanup. The lazy dof faulting will then
16500          * install new helpers which will never be cleaned up, and leak.
16501          */
16502         dtrace_lazy_dofs_destroy(p);
16503
16504         /*
16505          * Clean up any DTrace helper actions or probes for the process.
16506          */
16507         if (p->p_dtrace_helpers != NULL) {
16508                 (*dtrace_helpers_cleanup)(p);
16509         }
16510
16511         /*
16512          * Clean up any DTrace probes associated with this process.
16513          */
16514         /*
16515          * APPLE NOTE: We release ptss pages/entries in dtrace_fasttrap_exit_ptr(),
16516          * call this after dtrace_helpers_cleanup()
16517          */
16518         proc_lock(p);
16519         if (p->p_dtrace_probes && dtrace_fasttrap_exit_ptr) {
16520                 (*dtrace_fasttrap_exit_ptr)(p);
16521         }
16522         proc_unlock(p);
16523 }
16524
16525 /*
16526  * DTrace Hook Functions
16527  */
16528
16529 /*
16530  * APPLE NOTE:  dtrace_modctl_* routines for kext support.
16531  * Used to manipulate the modctl list within dtrace xnu.
16532  */
16533
16534 modctl_t *dtrace_modctl_list;
16535
16536 static void
16537 dtrace_modctl_add(struct modctl * newctl)
16538 {
16539         struct modctl *nextp, *prevp;
16540
16541         ASSERT(newctl != NULL);
16542         LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16543
16544         // Insert new module at the front of the list,
16545
16546         newctl->mod_next = dtrace_modctl_list;
16547         dtrace_modctl_list = newctl;
16548
16549         /*
16550          * If a module exists with the same name, then that module
16551          * must have been unloaded with enabled probes. We will move
16552          * the unloaded module to the new module's stale chain and
16553          * then stop traversing the list.
16554          */
16555
16556         prevp = newctl;
16557         nextp = newctl->mod_next;
16558
16559         while (nextp != NULL) {
16560                 if (nextp->mod_loaded) {
16561                         /* This is a loaded module. Keep traversing. */
16562                         prevp = nextp;
16563                         nextp = nextp->mod_next;
16564                         continue;
16565                 }
16566                 else {
16567                         /* Found an unloaded module */
16568                         if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
16569                                 /* Names don't match. Keep traversing. */
16570                                 prevp = nextp;
16571                                 nextp = nextp->mod_next;
16572                                 continue;
16573                         }
16574                         else {
16575                                 /* We found a stale entry, move it. We're done. */
16576                                 prevp->mod_next = nextp->mod_next;
16577                                 newctl->mod_stale = nextp;
16578                                 nextp->mod_next = NULL;
16579                                 break;
16580                         }
16581                 }
16582         }
16583 }
16584
16585 static modctl_t *
16586 dtrace_modctl_lookup(struct kmod_info * kmod)
16587 {
16588     LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16589
16590     struct modctl * ctl;
16591
16592     for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
16593         if (ctl->mod_id == kmod->id)
16594             return(ctl);
16595     }
16596     return (NULL);
16597 }
16598
16599 /*
16600  * This routine is called from dtrace_module_unloaded().
16601  * It removes a modctl structure and its stale chain
16602  * from the kext shadow list.
16603  */
16604 static void
16605 dtrace_modctl_remove(struct modctl * ctl)
16606 {
16607         ASSERT(ctl != NULL);
16608         LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16609         modctl_t *prevp, *nextp, *curp;
16610
16611         // Remove stale chain first
16612         for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
16613                 nextp = curp->mod_stale;
16614                 /* There should NEVER be user symbols allocated at this point */
16615                 ASSERT(curp->mod_user_symbols == NULL);
16616                 kmem_free(curp, sizeof(modctl_t));
16617         }
16618
16619         prevp = NULL;
16620         curp = dtrace_modctl_list;
16621
16622         while (curp != ctl) {
16623                 prevp = curp;
16624                 curp = curp->mod_next;
16625         }
16626
16627         if (prevp != NULL) {
16628                 prevp->mod_next = ctl->mod_next;
16629         }
16630         else {
16631                 dtrace_modctl_list = ctl->mod_next;
16632         }
16633
16634         /* There should NEVER be user symbols allocated at this point */
16635         ASSERT(ctl->mod_user_symbols == NULL);
16636
16637         kmem_free (ctl, sizeof(modctl_t));
16638 }
16639
16640 /*
16641  * APPLE NOTE: The kext loader will call dtrace_module_loaded
16642  * when the kext is loaded in memory, but before calling the
16643  * kext's start routine.
16644  *
16645  * Return 0 on success
16646  * Return -1 on failure
16647  */
16648
16649 static int
16650 dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
16651 {
16652         dtrace_provider_t *prv;
16653
16654         /*
16655          * If kernel symbols have been disabled, return immediately
16656          * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode, it is safe to test without holding locks
16657          */
16658         if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
16659                 return 0;
16660
16661         struct modctl *ctl = NULL;
16662         if (!kmod || kmod->address == 0 || kmod->size == 0)
16663                 return(-1);
16664
16665         lck_mtx_lock(&dtrace_provider_lock);
16666         lck_mtx_lock(&mod_lock);
16667
16668         /*
16669          * Have we seen this kext before?
16670          */
16671
16672         ctl = dtrace_modctl_lookup(kmod);
16673
16674         if (ctl != NULL) {
16675                 /* bail... we already have this kext in the modctl list */
16676                 lck_mtx_unlock(&mod_lock);
16677                 lck_mtx_unlock(&dtrace_provider_lock);
16678                 if (dtrace_err_verbose)
16679                         cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
16680                 return(-1);
16681         }
16682         else {
16683                 ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
16684                 if (ctl == NULL) {
16685                         if (dtrace_err_verbose)
16686                                 cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
16687                         lck_mtx_unlock(&mod_lock);
16688                         lck_mtx_unlock(&dtrace_provider_lock);
16689                         return (-1);
16690                 }
16691                 ctl->mod_next = NULL;
16692                 ctl->mod_stale = NULL;
16693                 strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
16694                 ctl->mod_loadcnt = kmod->id;
16695                 ctl->mod_nenabled = 0;
16696                 ctl->mod_address  = kmod->address;
16697                 ctl->mod_size = kmod->size;
16698                 ctl->mod_id = kmod->id;
16699                 ctl->mod_loaded = 1;
16700                 ctl->mod_flags = 0;
16701                 ctl->mod_user_symbols = NULL;
16702
16703                 /*
16704                  * Find the UUID for this module, if it has one
16705                  */
16706                 kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
16707                 struct load_command* load_cmd = (struct load_command *)&header[1];
16708                 uint32_t i;
16709                 for (i = 0; i < header->ncmds; i++) {
16710                         if (load_cmd->cmd == LC_UUID) {
16711                                 struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
16712                                 memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
16713                                 ctl->mod_flags |= MODCTL_HAS_UUID;
16714                                 break;
16715                         }
16716                         load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
16717                 }
16718
16719                 if (ctl->mod_address == g_kernel_kmod_info.address) {
16720                         ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
16721                         memcpy(dtrace_kerneluuid, ctl->mod_uuid, sizeof(dtrace_kerneluuid));
16722                 }
16723                 /*
16724                  * Static kexts have a UUID that is not used for symbolication, as all their
16725                  * symbols are in kernel
16726                  */
16727                 else if ((flag & KMOD_DTRACE_STATIC_KEXT) == KMOD_DTRACE_STATIC_KEXT) {
16728                         memcpy(ctl->mod_uuid, dtrace_kerneluuid, sizeof(dtrace_kerneluuid));
16729                         ctl->mod_flags |= MODCTL_IS_STATIC_KEXT;
16730                 }
16731         }
16732         dtrace_modctl_add(ctl);
16733
16734         /*
16735          * We must hold the dtrace_lock to safely test non permanent dtrace_fbt_symbol_mode(s)
16736          */
16737         lck_mtx_lock(&dtrace_lock);
16738
16739         /*
16740          * DTrace must decide if it will instrument modules lazily via
16741          * userspace symbols (default mode), or instrument immediately via
16742          * kernel symbols (non-default mode)
16743          *
16744          * When in default/lazy mode, DTrace will only support modules
16745          * built with a valid UUID.
16746          *
16747          * Overriding the default can be done explicitly in one of
16748          * the following two ways.
16749          *
16750          * A module can force symbols from kernel space using the plist key,
16751          * OSBundleForceDTraceInit (see kmod.h).  If this per kext state is set,
16752          * we fall through and instrument this module now.
16753          *
16754          * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
16755          * from kernel space (see dtrace_impl.h).  If this system state is set
16756          * to a non-userspace mode, we fall through and instrument the module now.
16757          */
16758
16759         if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
16760             (!(flag & KMOD_DTRACE_FORCE_INIT)))
16761         {
16762                 /* We will instrument the module lazily -- this is the default */
16763                 lck_mtx_unlock(&dtrace_lock);
16764                 lck_mtx_unlock(&mod_lock);
16765                 lck_mtx_unlock(&dtrace_provider_lock);
16766                 return 0;
16767         }
16768
16769         /* We will instrument the module immediately using kernel symbols */
16770         ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
16771
16772         lck_mtx_unlock(&dtrace_lock);
16773
16774         /*
16775          * We're going to call each providers per-module provide operation
16776          * specifying only this module.
16777          */
16778         for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16779                 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16780
16781         /*
16782          * APPLE NOTE: The contract with the kext loader is that once this function
16783          * has completed, it may delete kernel symbols at will.
16784          * We must set this while still holding the mod_lock.
16785          */
16786         ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
16787
16788         lck_mtx_unlock(&mod_lock);
16789         lck_mtx_unlock(&dtrace_provider_lock);
16790
16791         /*
16792          * If we have any retained enablings, we need to match against them.
16793          * Enabling probes requires that cpu_lock be held, and we cannot hold
16794          * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16795          * module.  (In particular, this happens when loading scheduling
16796          * classes.)  So if we have any retained enablings, we need to dispatch
16797          * our task queue to do the match for us.
16798          */
16799         lck_mtx_lock(&dtrace_lock);
16800
16801         if (dtrace_retained == NULL) {
16802                 lck_mtx_unlock(&dtrace_lock);
16803                 return 0;
16804         }
16805
16806         /* APPLE NOTE!
16807          *
16808          * The cpu_lock mentioned above is only held by dtrace code, Apple's xnu never actually
16809          * holds it for any reason. Thus the comment above is invalid, we can directly invoke
16810          * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid
16811          * the delay call as well.
16812          */
16813         lck_mtx_unlock(&dtrace_lock);
16814
16815         dtrace_enabling_matchall();
16816
16817         return 0;
16818 }
16819
16820 /*
16821  * Return 0 on success
16822  * Return -1 on failure
16823  */
16824 static int
16825 dtrace_module_unloaded(struct kmod_info *kmod)
16826 {
16827         dtrace_probe_t template, *probe, *first, *next;
16828         dtrace_provider_t *prov;
16829         struct modctl *ctl = NULL;
16830         struct modctl *syncctl = NULL;
16831         struct modctl *nextsyncctl = NULL;
16832         int syncmode = 0;
16833
16834         lck_mtx_lock(&dtrace_provider_lock);
16835         lck_mtx_lock(&mod_lock);
16836         lck_mtx_lock(&dtrace_lock);
16837
16838         if (kmod == NULL) {
16839             syncmode = 1;
16840         }
16841         else {
16842             ctl = dtrace_modctl_lookup(kmod);
16843             if (ctl == NULL)
16844             {
16845                 lck_mtx_unlock(&dtrace_lock);
16846                 lck_mtx_unlock(&mod_lock);
16847                 lck_mtx_unlock(&dtrace_provider_lock);
16848                 return (-1);
16849             }
16850             ctl->mod_loaded = 0;
16851             ctl->mod_address = 0;
16852             ctl->mod_size = 0;
16853         }
16854
16855         if (dtrace_bymod == NULL) {
16856                 /*
16857                  * The DTrace module is loaded (obviously) but not attached;
16858                  * we don't have any work to do.
16859                  */
16860                  if (ctl != NULL)
16861                          (void)dtrace_modctl_remove(ctl);
16862                  lck_mtx_unlock(&dtrace_lock);
16863                  lck_mtx_unlock(&mod_lock);
16864                  lck_mtx_unlock(&dtrace_provider_lock);
16865                  return(0);
16866         }
16867
16868         /* Syncmode set means we target and traverse entire modctl list. */
16869         if (syncmode)
16870             nextsyncctl = dtrace_modctl_list;
16871
16872 syncloop:
16873         if (syncmode)
16874         {
16875             /* find a stale modctl struct */
16876             for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
16877                 if (syncctl->mod_address == 0)
16878                     break;
16879             }
16880             if (syncctl==NULL)
16881             {
16882                 /* We have no more work to do */
16883                 lck_mtx_unlock(&dtrace_lock);
16884                 lck_mtx_unlock(&mod_lock);
16885                 lck_mtx_unlock(&dtrace_provider_lock);
16886                 return(0);
16887             }
16888             else {
16889                 /* keep track of next syncctl in case this one is removed */
16890                 nextsyncctl = syncctl->mod_next;
16891                 ctl = syncctl;
16892             }
16893         }
16894
16895         template.dtpr_mod = ctl->mod_modname;
16896
16897         for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16898             probe != NULL; probe = probe->dtpr_nextmod) {
16899                 if (probe->dtpr_ecb != NULL) {
16900                         /*
16901                          * This shouldn't _actually_ be possible -- we're
16902                          * unloading a module that has an enabled probe in it.
16903                          * (It's normally up to the provider to make sure that
16904                          * this can't happen.)  However, because dtps_enable()
16905                          * doesn't have a failure mode, there can be an
16906                          * enable/unload race.  Upshot:  we don't want to
16907                          * assert, but we're not going to disable the
16908                          * probe, either.
16909                          */
16910
16911
16912                         if (syncmode) {
16913                             /* We're syncing, let's look at next in list */
16914                             goto syncloop;
16915                         }
16916
16917                         lck_mtx_unlock(&dtrace_lock);
16918                         lck_mtx_unlock(&mod_lock);
16919                         lck_mtx_unlock(&dtrace_provider_lock);
16920
16921                         if (dtrace_err_verbose) {
16922                                 cmn_err(CE_WARN, "unloaded module '%s' had "
16923                                     "enabled probes", ctl->mod_modname);
16924                         }
16925                         return(-1);
16926                 }
16927         }
16928
16929         probe = first;
16930
16931         for (first = NULL; probe != NULL; probe = next) {
16932                 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16933
16934                 dtrace_probes[probe->dtpr_id - 1] = NULL;
16935                 probe->dtpr_provider->dtpv_probe_count--;
16936
16937                 next = probe->dtpr_nextmod;
16938                 dtrace_hash_remove(dtrace_byprov, probe);
16939                 dtrace_hash_remove(dtrace_bymod, probe);
16940                 dtrace_hash_remove(dtrace_byfunc, probe);
16941                 dtrace_hash_remove(dtrace_byname, probe);
16942
16943                 if (first == NULL) {
16944                         first = probe;
16945                         probe->dtpr_nextmod = NULL;
16946                 } else {
16947                         probe->dtpr_nextmod = first;
16948                         first = probe;
16949                 }
16950         }
16951
16952         /*
16953          * We've removed all of the module's probes from the hash chains and
16954          * from the probe array.  Now issue a dtrace_sync() to be sure that
16955          * everyone has cleared out from any probe array processing.
16956          */
16957         dtrace_sync();
16958
16959         for (probe = first; probe != NULL; probe = first) {
16960                 first = probe->dtpr_nextmod;
16961                 prov = probe->dtpr_provider;
16962                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16963                     probe->dtpr_arg);
16964                 dtrace_strunref(probe->dtpr_mod);
16965                 dtrace_strunref(probe->dtpr_func);
16966                 dtrace_strunref(probe->dtpr_name);
16967                 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16968
16969                 zfree(dtrace_probe_t_zone, probe);
16970         }
16971
16972         dtrace_modctl_remove(ctl);
16973
16974         if (syncmode)
16975             goto syncloop;
16976
16977         lck_mtx_unlock(&dtrace_lock);
16978         lck_mtx_unlock(&mod_lock);
16979         lck_mtx_unlock(&dtrace_provider_lock);
16980
16981         return(0);
16982 }
16983
16984 void
16985 dtrace_suspend(void)
16986 {
16987         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16988 }
16989
16990 void
16991 dtrace_resume(void)
16992 {
16993         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16994 }
16995
16996 static int
16997 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16998 {
16999         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17000         lck_mtx_lock(&dtrace_lock);
17001
17002         switch (what) {
17003         case CPU_CONFIG: {
17004                 dtrace_state_t *state;
17005                 dtrace_optval_t *opt, rs, c;
17006
17007                 /*
17008                  * For now, we only allocate a new buffer for anonymous state.
17009                  */
17010                 if ((state = dtrace_anon.dta_state) == NULL)
17011                         break;
17012
17013                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
17014                         break;
17015
17016                 opt = state->dts_options;
17017                 c = opt[DTRACEOPT_CPU];
17018
17019                 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
17020                         break;
17021
17022                 /*
17023                  * Regardless of what the actual policy is, we're going to
17024                  * temporarily set our resize policy to be manual.  We're
17025                  * also going to temporarily set our CPU option to denote
17026                  * the newly configured CPU.
17027                  */
17028                 rs = opt[DTRACEOPT_BUFRESIZE];
17029                 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
17030                 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
17031
17032                 (void) dtrace_state_buffers(state);
17033
17034                 opt[DTRACEOPT_BUFRESIZE] = rs;
17035                 opt[DTRACEOPT_CPU] = c;
17036
17037                 break;
17038         }
17039
17040         case CPU_UNCONFIG:
17041                 /*
17042                  * We don't free the buffer in the CPU_UNCONFIG case.  (The
17043                  * buffer will be freed when the consumer exits.)
17044                  */
17045                 break;
17046
17047         default:
17048                 break;
17049         }
17050
17051         lck_mtx_unlock(&dtrace_lock);
17052         return (0);
17053 }
17054
17055 static void
17056 dtrace_cpu_setup_initial(processorid_t cpu)
17057 {
17058         (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
17059 }
17060
17061 static void
17062 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
17063 {
17064         if (dtrace_toxranges >= dtrace_toxranges_max) {
17065                 int osize, nsize;
17066                 dtrace_toxrange_t *range;
17067
17068                 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17069
17070                 if (osize == 0) {
17071                         ASSERT(dtrace_toxrange == NULL);
17072                         ASSERT(dtrace_toxranges_max == 0);
17073                         dtrace_toxranges_max = 1;
17074                 } else {
17075                         dtrace_toxranges_max <<= 1;
17076                 }
17077
17078                 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17079                 range = kmem_zalloc(nsize, KM_SLEEP);
17080
17081                 if (dtrace_toxrange != NULL) {
17082                         ASSERT(osize != 0);
17083                         bcopy(dtrace_toxrange, range, osize);
17084                         kmem_free(dtrace_toxrange, osize);
17085                 }
17086
17087                 dtrace_toxrange = range;
17088         }
17089
17090         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
17091         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
17092
17093         dtrace_toxrange[dtrace_toxranges].dtt_base = base;
17094         dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
17095         dtrace_toxranges++;
17096 }
17097
17098 /*
17099  * DTrace Driver Cookbook Functions
17100  */
/*
 * Attach the DTrace framework.
 *
 * Installs the framework's hook function pointers, creates the probe-id
 * arena, the state cache, the probe array and the four probe hash tables,
 * discovers toxic ranges, registers the "dtrace" pseudo provider and
 * creates its BEGIN, END and ERROR probes.  If an anonymous enabling is
 * present it is matched, and any anonymous state is started.
 *
 * devi is stored in dtrace_devi.
 *
 * Locking: cpu_lock, dtrace_provider_lock and dtrace_lock are acquired
 * here (in that order) and all are released before returning.
 *
 * Always returns DDI_SUCCESS.
 */
/*ARGSUSED*/
static int
dtrace_attach(dev_info_t *devi)
{
        dtrace_provider_id_t id;
        dtrace_state_t *state = NULL;
        dtrace_enabling_t *enab;

        lck_mtx_lock(&cpu_lock);
        lck_mtx_lock(&dtrace_provider_lock);
        lck_mtx_lock(&dtrace_lock);

        /* Darwin uses BSD cloning device driver to automagically obtain minor device number. */
        dtrace_devi = devi;

        /* Publish the framework entry points through the global hook pointers. */
        dtrace_modload = dtrace_module_loaded;
        dtrace_modunload = dtrace_module_unloaded;
        dtrace_cpu_init = dtrace_cpu_setup_initial;
        dtrace_helpers_cleanup = dtrace_helpers_destroy;
        dtrace_helpers_fork = dtrace_helpers_duplicate;
        dtrace_cpustart_init = dtrace_suspend;
        dtrace_cpustart_fini = dtrace_resume;
        dtrace_debugger_init = dtrace_suspend;
        dtrace_debugger_fini = dtrace_resume;

        register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);

        LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);

        /* Arena from which probe ids are handed out (ids start at 1). */
        dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
            NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);

        dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
            sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
            NULL, NULL, NULL, NULL, NULL, 0);

        LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);

        /* Probe array, indexed by probe id - 1. */
        dtrace_nprobes = dtrace_nprobes_default;
        dtrace_probes = kmem_zalloc(sizeof(dtrace_probe_t*) * dtrace_nprobes,
            KM_SLEEP);

        /*
         * One hash table per probe-description component; each is chained
         * through its own next/prev pair inside dtrace_probe_t.
         */
        dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider,
            0, /* unused */
            offsetof(dtrace_probe_t, dtpr_nextprov),
            offsetof(dtrace_probe_t, dtpr_prevprov));

        dtrace_bymod = dtrace_hash_create(dtrace_strkey_deref_offset,
            offsetof(dtrace_probe_t, dtpr_mod),
            offsetof(dtrace_probe_t, dtpr_nextmod),
            offsetof(dtrace_probe_t, dtpr_prevmod));

        dtrace_byfunc = dtrace_hash_create(dtrace_strkey_deref_offset,
            offsetof(dtrace_probe_t, dtpr_func),
            offsetof(dtrace_probe_t, dtpr_nextfunc),
            offsetof(dtrace_probe_t, dtpr_prevfunc));

        dtrace_byname = dtrace_hash_create(dtrace_strkey_deref_offset,
            offsetof(dtrace_probe_t, dtpr_name),
            offsetof(dtrace_probe_t, dtpr_nextname),
            offsetof(dtrace_probe_t, dtpr_prevname));

        if (dtrace_retain_max < 1) {
                cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
                    "setting to 1", dtrace_retain_max);
                dtrace_retain_max = 1;
        }

        /*
         * Now discover our toxic ranges.
         */
        dtrace_toxic_ranges(dtrace_toxrange_add);

        /*
         * Before we register ourselves as a provider to our own framework,
         * we would like to assert that dtrace_provider is NULL -- but that's
         * not true if we were loaded as a dependency of a DTrace provider.
         * Once we've registered, we can assert that dtrace_provider is our
         * pseudo provider.
         */
        (void) dtrace_register("dtrace", &dtrace_provider_attr,
            DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);

        ASSERT(dtrace_provider != NULL);
        ASSERT((dtrace_provider_id_t)dtrace_provider == id);

        /*
         * Create the BEGIN, END and ERROR probes.  The integer argument
         * differs per architecture (presumably the number of artificial
         * stack frames to skip -- confirm against dtrace_probe_create()).
         */
#if defined (__x86_64__)
        dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
            dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
        dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
            dtrace_provider, NULL, NULL, "END", 0, NULL);
        dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
            dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
#elif (defined(__arm__) || defined(__arm64__))
        dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
            dtrace_provider, NULL, NULL, "BEGIN", 2, NULL);
        dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
            dtrace_provider, NULL, NULL, "END", 1, NULL);
        dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
            dtrace_provider, NULL, NULL, "ERROR", 4, NULL);
#else
#error Unknown Architecture
#endif

        dtrace_anon_property();
        /* cpu_lock is dropped here; the other two locks remain held. */
        lck_mtx_unlock(&cpu_lock);

        /*
         * If DTrace helper tracing is enabled, we need to allocate the
         * trace buffer and initialize the values.
         */
        if (dtrace_helptrace_enabled) {
                ASSERT(dtrace_helptrace_buffer == NULL);
                dtrace_helptrace_buffer =
                    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
                dtrace_helptrace_next = 0;
        }

        /*
         * If there are already providers, we must ask them to provide their
         * probes, and then match any anonymous enabling against them.  Note
         * that there should be no other retained enablings at this time:
         * the only retained enablings at this time should be the anonymous
         * enabling.
         */
        if (dtrace_anon.dta_enabling != NULL) {
                ASSERT(dtrace_retained == dtrace_anon.dta_enabling);

                /*
                 * APPLE NOTE: if handling anonymous dof, switch symbol modes.
                 */
                if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
                        dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
                }

                dtrace_enabling_provide(NULL);
                state = dtrace_anon.dta_state;

                /*
                 * We couldn't hold cpu_lock across the above call to
                 * dtrace_enabling_provide(), but we must hold it to actually
                 * enable the probes.  We have to drop all of our locks, pick
                 * up cpu_lock, and regain our locks before matching the
                 * retained anonymous enabling.
                 */
                lck_mtx_unlock(&dtrace_lock);
                lck_mtx_unlock(&dtrace_provider_lock);

                lck_mtx_lock(&cpu_lock);
                lck_mtx_lock(&dtrace_provider_lock);
                lck_mtx_lock(&dtrace_lock);

                /* Re-read the enabling: the locks were dropped in between. */
                if ((enab = dtrace_anon.dta_enabling) != NULL)
                        (void) dtrace_enabling_match(enab, NULL, NULL);

                lck_mtx_unlock(&cpu_lock);
        }

        lck_mtx_unlock(&dtrace_lock);
        lck_mtx_unlock(&dtrace_provider_lock);

        if (state != NULL) {
                /*
                 * If we created any anonymous state, set it going now.
                 */
                (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
        }

        return (DDI_SUCCESS);
}
17271
17272 /*ARGSUSED*/
17273 static int
17274 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
17275 {
17276 #pragma unused(flag, otyp)
17277         dtrace_state_t *state;
17278         uint32_t priv;
17279         uid_t uid;
17280         zoneid_t zoneid;
17281         int rv;
17282
17283         /* APPLE: Darwin puts Helper on its own major device. */
17284
17285         /*
17286          * If no DTRACE_PRIV_* bits are set in the credential, then the
17287          * caller lacks sufficient permission to do anything with DTrace.
17288          */
17289         dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
17290         if (priv == DTRACE_PRIV_NONE)
17291                 return (EACCES);
17292
17293         /*
17294          * APPLE NOTE: We delay the initialization of fasttrap as late as possible.
17295          * It certainly can't be later than now!
17296          */
17297         fasttrap_init();
17298
17299         /*
17300          * Ask all providers to provide all their probes.
17301          */
17302         lck_mtx_lock(&dtrace_provider_lock);
17303         dtrace_probe_provide(NULL, NULL);
17304         lck_mtx_unlock(&dtrace_provider_lock);
17305
17306         lck_mtx_lock(&cpu_lock);
17307         lck_mtx_lock(&dtrace_lock);
17308         dtrace_opens++;
17309         dtrace_membar_producer();
17310
17311 #ifdef illumos
17312         /*
17313          * If the kernel debugger is active (that is, if the kernel debugger
17314          * modified text in some way), we won't allow the open.
17315          */
17316         if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
17317                 dtrace_opens--;
17318                 lck_mtx_unlock(&dtrace_lock);
17319                 lck_mtx_unlock(&cpu_lock);
17320                 return (EBUSY);
17321         }
17322 #endif
17323
17324         rv = dtrace_state_create(devp, cred_p, &state);
17325         lck_mtx_unlock(&cpu_lock);
17326
17327         if (rv != 0 || state == NULL) {
17328                 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
17329 #ifdef illumos
17330                         (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17331 #endif
17332                 }
17333                 lck_mtx_unlock(&dtrace_lock);
17334                 /* propagate EAGAIN or ERESTART */
17335                 return (rv);
17336         }
17337
17338         lck_mtx_unlock(&dtrace_lock);
17339
17340         lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17341
17342         /*
17343          * If we are currently lazy, transition states.
17344          *
17345          * Unlike dtrace_close, we do not need to check the
17346          * value of dtrace_opens, as any positive value (and
17347          * we count as 1) means we transition states.
17348          */
17349         if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
17350                 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
17351                 /*
17352                  * We do not need to hold the exclusive lock while processing
17353                  * DOF on processes. We do need to make sure the mode does not get
17354                  * changed to DTRACE_DOF_MODE_LAZY_ON during that stage though
17355                  * (which should not happen anyway since it only happens in
17356                  * dtrace_close). There is no way imcomplete USDT probes can be
17357                  * activate by any DTrace clients here since they all have to
17358                  * call dtrace_open and be blocked on dtrace_dof_mode_lock
17359                  */
17360                 lck_rw_lock_exclusive_to_shared(&dtrace_dof_mode_lock);
17361                 /*
17362                  * Iterate all existing processes and load lazy dofs.
17363                  */
17364                 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
17365                              dtrace_lazy_dofs_proc_iterate_doit,
17366                              NULL,
17367                              dtrace_lazy_dofs_proc_iterate_filter,
17368                              NULL);
17369
17370                 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
17371         }
17372         else {
17373                 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17374         }
17375
17376
17377         /*
17378          * Update kernel symbol state.
17379          *
17380          * We must own the provider and dtrace locks.
17381          *
17382          * NOTE! It may appear there is a race by setting this value so late
17383          * after dtrace_probe_provide. However, any kext loaded after the
17384          * call to probe provide and before we set LAZY_OFF will be marked as
17385          * eligible for symbols from userspace. The same dtrace that is currently
17386          * calling dtrace_open() (this call!) will get a list of kexts needing
17387          * symbols and fill them in, thus closing the race window.
17388          *
17389          * We want to set this value only after it is certain it will succeed, as
17390          * this significantly reduces the complexity of error exits.
17391          */
17392         lck_mtx_lock(&dtrace_lock);
17393         if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17394                 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17395         }
17396         lck_mtx_unlock(&dtrace_lock);
17397
17398         return (0);
17399 }
17400
17401 /*ARGSUSED*/
17402 static int
17403 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
17404 {
17405 #pragma unused(flag, otyp, cred_p) /* __APPLE__ */
17406         minor_t minor = getminor(dev);
17407         dtrace_state_t *state;
17408
17409         /* APPLE NOTE: Darwin puts Helper on its own major device. */
17410         state = dtrace_state_get(minor);
17411
17412         lck_mtx_lock(&cpu_lock);
17413         lck_mtx_lock(&dtrace_lock);
17414
17415         if (state->dts_anon) {
17416                 /*
17417                  * There is anonymous state. Destroy that first.
17418                  */
17419                 ASSERT(dtrace_anon.dta_state == NULL);
17420                 dtrace_state_destroy(state->dts_anon);
17421         }
17422
17423         dtrace_state_destroy(state);
17424         ASSERT(dtrace_opens > 0);
17425
17426         /*
17427          * Only relinquish control of the kernel debugger interface when there
17428          * are no consumers and no anonymous enablings.
17429          */
17430         if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
17431 #ifdef illumos
17432                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17433 #endif
17434         }
17435
17436         lck_mtx_unlock(&dtrace_lock);
17437         lck_mtx_unlock(&cpu_lock);
17438
17439         /*
17440          * Lock ordering requires the dof mode lock be taken before
17441          * the dtrace_lock.
17442          */
17443         lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17444         lck_mtx_lock(&dtrace_lock);
17445
17446         if (dtrace_opens == 0) {
17447                 /*
17448                  * If we are currently lazy-off, and this is the last close, transition to
17449                  * lazy state.
17450                  */
17451                 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
17452                         dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
17453                 }
17454
17455                 /*
17456                  * If we are the last dtrace client, switch back to lazy (from userspace) symbols
17457                  */
17458                 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
17459                         dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
17460                 }
17461         }
17462
17463         lck_mtx_unlock(&dtrace_lock);
17464         lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17465
17466         /*
17467          * Kext probes may be retained past the end of the kext's lifespan. The
17468          * probes are kept until the last reference to them has been removed.
17469          * Since closing an active dtrace context is likely to drop that last reference,
17470          * lets take a shot at cleaning out the orphaned probes now.
17471          */
17472         dtrace_module_unloaded(NULL);
17473
17474         return (0);
17475 }
17476
17477 /*ARGSUSED*/
17478 static int
17479 dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
17480 {
17481 #pragma unused(rv)
17482         /*
17483          * Safe to check this outside the dof mode lock
17484          */
17485         if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
17486                 return KERN_SUCCESS;
17487
17488         switch (cmd) {
17489 #if defined (__arm64__)
17490         case DTRACEHIOC_ADDDOF_U32:
17491         case DTRACEHIOC_ADDDOF_U64:
17492 #else
17493         case DTRACEHIOC_ADDDOF:
17494 #endif /* __arm64__*/
17495                         {
17496                         dof_helper_t *dhp = NULL;
17497                         size_t dof_ioctl_data_size;
17498                         dof_ioctl_data_t* multi_dof;
17499                         unsigned int i;
17500                         int rval = 0;
17501                         user_addr_t user_address = *(user_addr_t*)arg;
17502                         uint64_t dof_count;
17503                         int multi_dof_claimed = 0;
17504                         proc_t* p = current_proc();
17505
17506                         /*
17507                          * If this is a restricted process and dtrace is restricted,
17508                          * do not allow DOFs to be registered
17509                          */
17510                         if (dtrace_is_restricted() &&
17511                                 !dtrace_are_restrictions_relaxed() &&
17512                                 !dtrace_can_attach_to_proc(current_proc())) {
17513                                 return (EACCES);
17514                         }
17515
17516                         /*
17517                          * Read the number of DOF sections being passed in.
17518                          */
17519                         if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
17520                                    &dof_count,
17521                                    sizeof(dof_count))) {
17522                                 dtrace_dof_error(NULL, "failed to copyin dofiod_count");
17523                                 return (EFAULT);
17524                         }
17525
17526                         /*
17527                          * Range check the count.
17528                          */
17529                         if (dof_count == 0 || dof_count > 1024) {
17530                                 dtrace_dof_error(NULL, "dofiod_count is not valid");
17531                                 return (EINVAL);
17532                         }
17533
17534                         /*
17535                          * Allocate a correctly sized structure and copyin the data.
17536                          */
17537                         dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
17538                         if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
17539                                 return (ENOMEM);
17540
17541                         /* NOTE! We can no longer exit this method via return */
17542                         if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
17543                                 dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
17544                                 rval = EFAULT;
17545                                 goto cleanup;
17546                         }
17547
17548                         /*
17549                          * Check that the count didn't change between the first copyin and the second.
17550                          */
17551                         if (multi_dof->dofiod_count != dof_count) {
17552                                 rval = EINVAL;
17553                                 goto cleanup;
17554                         }
17555
17556                         /*
17557                          * Try to process lazily first.
17558                          */
17559                         rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
17560
17561                         /*
17562                          * If rval is EACCES, we must be non-lazy.
17563                          */
17564                         if (rval == EACCES) {
17565                                 rval = 0;
17566                                 /*
17567                                  * Process each dof_helper_t
17568                                  */
17569                                 i = 0;
17570                                 do {
17571                                         dhp = &multi_dof->dofiod_helpers[i];
17572
17573                                         dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
17574
17575                                         if (dof != NULL) {
17576                                                 lck_mtx_lock(&dtrace_meta_lock);
17577                                                 lck_mtx_lock(&dtrace_lock);
17578
17579                                                 /*
17580                                                  * dtrace_helper_slurp() takes responsibility for the dof --
17581                                                  * it may free it now or it may save it and free it later.
17582                                                  */
17583                                                 if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
17584                                                         rval = EINVAL;
17585                                                 }
17586
17587                                                 lck_mtx_unlock(&dtrace_lock);
17588                                                 lck_mtx_unlock(&dtrace_meta_lock);
17589                                         }
17590                                 } while (++i < multi_dof->dofiod_count && rval == 0);
17591                         }
17592
17593                         /*
17594                          * We need to copyout the multi_dof struct, because it contains
17595                          * the generation (unique id) values needed to call DTRACEHIOC_REMOVE
17596                          *
17597                          * This could certainly be better optimized.
17598                          */
17599                         if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
17600                                 dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
17601                                 /* Don't overwrite pre-existing error code */
17602                                 if (rval == 0) rval = EFAULT;
17603                         }
17604
17605                 cleanup:
17606                         /*
17607                          * If we had to allocate struct memory, free it.
17608                          */
17609                         if (multi_dof != NULL && !multi_dof_claimed) {
17610                                 kmem_free(multi_dof, dof_ioctl_data_size);
17611                         }
17612
17613                         return rval;
17614                 }
17615
17616                 case DTRACEHIOC_REMOVE: {
17617                         int generation = *(int*)arg;
17618                         proc_t* p = current_proc();
17619
17620                         /*
17621                          * Try lazy first.
17622                          */
17623                         int rval = dtrace_lazy_dofs_remove(p, generation);
17624
17625                         /*
17626                          * EACCES means non-lazy
17627                          */
17628                         if (rval == EACCES) {
17629                                 lck_mtx_lock(&dtrace_meta_lock);
17630                                 lck_mtx_lock(&dtrace_lock);
17631                                 rval = dtrace_helper_destroygen(p, generation);
17632                                 lck_mtx_unlock(&dtrace_lock);
17633                                 lck_mtx_unlock(&dtrace_meta_lock);
17634                         }
17635
17636                         return (rval);
17637                 }
17638
17639                 default:
17640                         break;
17641         }
17642
17643         return ENOTTY;
17644 }
17645
17646 /*ARGSUSED*/
17647 static int
17648 dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
17649 {
17650 #pragma unused(md)
17651         minor_t minor = getminor(dev);
17652         dtrace_state_t *state;
17653         int rval;
17654
17655         /* Darwin puts Helper on its own major device. */
17656
17657         state = dtrace_state_get(minor);
17658
17659         if (state->dts_anon) {
17660            ASSERT(dtrace_anon.dta_state == NULL);
17661            state = state->dts_anon;
17662         }
17663
17664         switch (cmd) {
17665         case DTRACEIOC_PROVIDER: {
17666                 dtrace_providerdesc_t pvd;
17667                 dtrace_provider_t *pvp;
17668
17669                 if (copyin(arg, &pvd, sizeof (pvd)) != 0)
17670                         return (EFAULT);
17671
17672                 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17673                 lck_mtx_lock(&dtrace_provider_lock);
17674
17675                 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17676                         if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
17677                                 break;
17678                 }
17679
17680                 lck_mtx_unlock(&dtrace_provider_lock);
17681
17682                 if (pvp == NULL)
17683                         return (ESRCH);
17684
17685                 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17686                 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17687                 if (copyout(&pvd, arg, sizeof (pvd)) != 0)
17688                         return (EFAULT);
17689
17690                 return (0);
17691         }
17692
17693         case DTRACEIOC_EPROBE: {
17694                 dtrace_eprobedesc_t epdesc;
17695                 dtrace_ecb_t *ecb;
17696                 dtrace_action_t *act;
17697                 void *buf;
17698                 size_t size;
17699                 uintptr_t dest;
17700                 int nrecs;
17701
17702                 if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
17703                         return (EFAULT);
17704
17705                 lck_mtx_lock(&dtrace_lock);
17706
17707                 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17708                         lck_mtx_unlock(&dtrace_lock);
17709                         return (EINVAL);
17710                 }
17711
17712                 if (ecb->dte_probe == NULL) {
17713                         lck_mtx_unlock(&dtrace_lock);
17714                         return (EINVAL);
17715                 }
17716
17717                 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17718                 epdesc.dtepd_uarg = ecb->dte_uarg;
17719                 epdesc.dtepd_size = ecb->dte_size;
17720
17721                 nrecs = epdesc.dtepd_nrecs;
17722                 epdesc.dtepd_nrecs = 0;
17723                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17724                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17725                                 continue;
17726
17727                         epdesc.dtepd_nrecs++;
17728                 }
17729
17730                 /*
17731                  * Now that we have the size, we need to allocate a temporary
17732                  * buffer in which to store the complete description.  We need
17733                  * the temporary buffer to be able to drop dtrace_lock()
17734                  * across the copyout(), below.
17735                  */
17736                 size = sizeof (dtrace_eprobedesc_t) +
17737                         (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17738
17739                 buf = kmem_alloc(size, KM_SLEEP);
17740                 dest = (uintptr_t)buf;
17741
17742                 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17743                 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17744
17745                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17746                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17747                                 continue;
17748
17749                         if (nrecs-- == 0)
17750                                 break;
17751
17752                         bcopy(&act->dta_rec, (void *)dest,
17753                         sizeof (dtrace_recdesc_t));
17754                         dest += sizeof (dtrace_recdesc_t);
17755                 }
17756
17757                 lck_mtx_unlock(&dtrace_lock);
17758
17759                 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17760                         kmem_free(buf, size);
17761                         return (EFAULT);
17762                 }
17763
17764                 kmem_free(buf, size);
17765                 return (0);
17766         }
17767
17768         case DTRACEIOC_AGGDESC: {
17769                 dtrace_aggdesc_t aggdesc;
17770                 dtrace_action_t *act;
17771                 dtrace_aggregation_t *agg;
17772                 int nrecs;
17773                 uint32_t offs;
17774                 dtrace_recdesc_t *lrec;
17775                 void *buf;
17776                 size_t size;
17777                 uintptr_t dest;
17778
17779                 if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
17780                         return (EFAULT);
17781
17782                 lck_mtx_lock(&dtrace_lock);
17783
17784                 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17785                         lck_mtx_unlock(&dtrace_lock);
17786                         return (EINVAL);
17787                 }
17788
17789                 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17790
17791                 nrecs = aggdesc.dtagd_nrecs;
17792                 aggdesc.dtagd_nrecs = 0;
17793
17794                 offs = agg->dtag_base;
17795                 lrec = &agg->dtag_action.dta_rec;
17796                 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17797
17798                 for (act = agg->dtag_first; ; act = act->dta_next) {
17799                         ASSERT(act->dta_intuple ||
17800                         DTRACEACT_ISAGG(act->dta_kind));
17801
17802                         /*
17803                          * If this action has a record size of zero, it
17804                          * denotes an argument to the aggregating action.
17805                          * Because the presence of this record doesn't (or
17806                          * shouldn't) affect the way the data is interpreted,
17807                          * we don't copy it out to save user-level the
17808                          * confusion of dealing with a zero-length record.
17809                          */
17810                         if (act->dta_rec.dtrd_size == 0) {
17811                                 ASSERT(agg->dtag_hasarg);
17812                                 continue;
17813                         }
17814
17815                         aggdesc.dtagd_nrecs++;
17816
17817                         if (act == &agg->dtag_action)
17818                                 break;
17819                 }
17820
17821                 /*
17822                  * Now that we have the size, we need to allocate a temporary
17823                  * buffer in which to store the complete description.  We need
17824                  * the temporary buffer to be able to drop dtrace_lock()
17825                  * across the copyout(), below.
17826                  */
17827                 size = sizeof (dtrace_aggdesc_t) +
17828                         (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17829
17830                 buf = kmem_alloc(size, KM_SLEEP);
17831                 dest = (uintptr_t)buf;
17832
17833                 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17834                 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17835
17836                 for (act = agg->dtag_first; ; act = act->dta_next) {
17837                         dtrace_recdesc_t rec = act->dta_rec;
17838
17839                         /*
17840                          * See the comment in the above loop for why we pass
17841                          * over zero-length records.
17842                          */
17843                         if (rec.dtrd_size == 0) {
17844                                 ASSERT(agg->dtag_hasarg);
17845                                 continue;
17846                         }
17847
17848                         if (nrecs-- == 0)
17849                                 break;
17850
17851                         rec.dtrd_offset -= offs;
17852                         bcopy(&rec, (void *)dest, sizeof (rec));
17853                         dest += sizeof (dtrace_recdesc_t);
17854
17855                         if (act == &agg->dtag_action)
17856                                 break;
17857                 }
17858
17859                 lck_mtx_unlock(&dtrace_lock);
17860
17861                 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17862                         kmem_free(buf, size);
17863                         return (EFAULT);
17864                 }
17865
17866                 kmem_free(buf, size);
17867                 return (0);
17868         }
17869
17870         case DTRACEIOC_ENABLE: {
17871                 dof_hdr_t *dof;
17872                 dtrace_enabling_t *enab = NULL;
17873                 dtrace_vstate_t *vstate;
17874                 int err = 0;
17875
17876                 *rv = 0;
17877
17878                 /*
17879                  * If a NULL argument has been passed, we take this as our
17880                  * cue to reevaluate our enablings.
17881                  */
17882                 if (arg == 0) {
17883                         dtrace_enabling_matchall();
17884
17885                         return (0);
17886                 }
17887
17888                 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17889                         return (rval);
17890
17891                 lck_mtx_lock(&cpu_lock);
17892                 lck_mtx_lock(&dtrace_lock);
17893                 vstate = &state->dts_vstate;
17894
17895                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17896                         lck_mtx_unlock(&dtrace_lock);
17897                         lck_mtx_unlock(&cpu_lock);
17898                         dtrace_dof_destroy(dof);
17899                         return (EBUSY);
17900                 }
17901
17902                 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17903                         lck_mtx_unlock(&dtrace_lock);
17904                         lck_mtx_unlock(&cpu_lock);
17905                         dtrace_dof_destroy(dof);
17906                         return (EINVAL);
17907                 }
17908
17909                 if ((rval = dtrace_dof_options(dof, state)) != 0) {
17910                         dtrace_enabling_destroy(enab);
17911                         lck_mtx_unlock(&dtrace_lock);
17912                         lck_mtx_unlock(&cpu_lock);
17913                         dtrace_dof_destroy(dof);
17914                         return (rval);
17915                 }
17916
17917                 if ((err = dtrace_enabling_match(enab, rv, NULL)) == 0) {
17918                         err = dtrace_enabling_retain(enab);
17919                 } else {
17920                         dtrace_enabling_destroy(enab);
17921                 }
17922
17923                 lck_mtx_unlock(&dtrace_lock);
17924                 lck_mtx_unlock(&cpu_lock);
17925                 dtrace_dof_destroy(dof);
17926
17927                 return (err);
17928         }
17929
17930         case DTRACEIOC_REPLICATE: {
17931                 dtrace_repldesc_t desc;
17932                 dtrace_probedesc_t *match = &desc.dtrpd_match;
17933                 dtrace_probedesc_t *create = &desc.dtrpd_create;
17934                 int err;
17935
17936                 if (copyin(arg, &desc, sizeof (desc)) != 0)
17937                         return (EFAULT);
17938
17939                 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17940                 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17941                 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17942                 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17943
17944                 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17945                 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17946                 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17947                 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17948
17949                 lck_mtx_lock(&dtrace_lock);
17950                 err = dtrace_enabling_replicate(state, match, create);
17951                 lck_mtx_unlock(&dtrace_lock);
17952
17953                 return (err);
17954         }
17955
17956         case DTRACEIOC_PROBEMATCH:
17957         case DTRACEIOC_PROBES: {
17958                 dtrace_probe_t *probe = NULL;
17959                 dtrace_probedesc_t desc;
17960                 dtrace_probekey_t pkey;
17961                 dtrace_id_t i;
17962                 int m = 0;
17963                 uint32_t priv;
17964                 uid_t uid;
17965                 zoneid_t zoneid;
17966
17967                 if (copyin(arg, &desc, sizeof (desc)) != 0)
17968                         return (EFAULT);
17969
17970                 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17971                 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17972                 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17973                 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17974
17975                 /*
17976                  * Before we attempt to match this probe, we want to give
17977                  * all providers the opportunity to provide it.
17978                  */
17979                 if (desc.dtpd_id == DTRACE_IDNONE) {
17980                         lck_mtx_lock(&dtrace_provider_lock);
17981                         dtrace_probe_provide(&desc, NULL);
17982                         lck_mtx_unlock(&dtrace_provider_lock);
17983                         desc.dtpd_id++;
17984                 }
17985
17986                 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
17987
17988                 lck_mtx_lock(&dtrace_lock);
17989
17990                 if (cmd == DTRACEIOC_PROBEMATCH)  {
17991                         dtrace_probekey(&desc, &pkey);
17992                         pkey.dtpk_id = DTRACE_IDNONE;
17993
17994                         /* Quiet compiler warning */
17995                         for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
17996                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
17997                                         (m = dtrace_match_probe(probe, &pkey,
17998                                         priv, uid, zoneid)) != 0)
17999                                         break;
18000                         }
18001
18002                         if (m < 0) {
18003                                 lck_mtx_unlock(&dtrace_lock);
18004                                 return (EINVAL);
18005                         }
18006                         dtrace_probekey_release(&pkey);
18007
18008                 } else {
18009                         /* Quiet compiler warning */
18010                         for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18011                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
18012                                         dtrace_match_priv(probe, priv, uid, zoneid))
18013                                         break;
18014                         }
18015                 }
18016
18017                 if (probe == NULL) {
18018                         lck_mtx_unlock(&dtrace_lock);
18019                         return (ESRCH);
18020                 }
18021
18022                 dtrace_probe_description(probe, &desc);
18023                 lck_mtx_unlock(&dtrace_lock);
18024
18025                 if (copyout(&desc, arg, sizeof (desc)) != 0)
18026                         return (EFAULT);
18027
18028                 return (0);
18029         }
18030
18031         case DTRACEIOC_PROBEARG: {
18032                 dtrace_argdesc_t desc;
18033                 dtrace_probe_t *probe;
18034                 dtrace_provider_t *prov;
18035
18036                 if (copyin(arg, &desc, sizeof (desc)) != 0)
18037                         return (EFAULT);
18038
18039                 if (desc.dtargd_id == DTRACE_IDNONE)
18040                         return (EINVAL);
18041
18042                 if (desc.dtargd_ndx == DTRACE_ARGNONE)
18043                         return (EINVAL);
18044
18045                 lck_mtx_lock(&dtrace_provider_lock);
18046                 lck_mtx_lock(&mod_lock);
18047                 lck_mtx_lock(&dtrace_lock);
18048
18049                 /* Quiet compiler warning */
18050                 if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
18051                         lck_mtx_unlock(&dtrace_lock);
18052                         lck_mtx_unlock(&mod_lock);
18053                         lck_mtx_unlock(&dtrace_provider_lock);
18054                         return (EINVAL);
18055                 }
18056
18057                 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
18058                         lck_mtx_unlock(&dtrace_lock);
18059                         lck_mtx_unlock(&mod_lock);
18060                         lck_mtx_unlock(&dtrace_provider_lock);
18061                         return (EINVAL);
18062                 }
18063
18064                 lck_mtx_unlock(&dtrace_lock);
18065
18066                 prov = probe->dtpr_provider;
18067
18068                 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
18069                 /*
18070                  * There isn't any typed information for this probe.
18071                  * Set the argument number to DTRACE_ARGNONE.
18072                  */
18073                         desc.dtargd_ndx = DTRACE_ARGNONE;
18074                 } else {
18075                         desc.dtargd_native[0] = '\0';
18076                         desc.dtargd_xlate[0] = '\0';
18077                         desc.dtargd_mapping = desc.dtargd_ndx;
18078
18079                         prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
18080                         probe->dtpr_id, probe->dtpr_arg, &desc);
18081                 }
18082
18083                 lck_mtx_unlock(&mod_lock);
18084                 lck_mtx_unlock(&dtrace_provider_lock);
18085
18086                 if (copyout(&desc, arg, sizeof (desc)) != 0)
18087                         return (EFAULT);
18088
18089                 return (0);
18090         }
18091
18092         case DTRACEIOC_GO: {
18093                 processorid_t cpuid;
18094                 rval = dtrace_state_go(state, &cpuid);
18095
18096                 if (rval != 0)
18097                         return (rval);
18098
18099                 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18100                         return (EFAULT);
18101
18102                 return (0);
18103         }
18104
18105         case DTRACEIOC_STOP: {
18106                 processorid_t cpuid;
18107
18108                 lck_mtx_lock(&dtrace_lock);
18109                 rval = dtrace_state_stop(state, &cpuid);
18110                 lck_mtx_unlock(&dtrace_lock);
18111
18112                 if (rval != 0)
18113                         return (rval);
18114
18115                 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18116                         return (EFAULT);
18117
18118                 return (0);
18119         }
18120
18121         case DTRACEIOC_DOFGET: {
18122                 dof_hdr_t hdr, *dof;
18123                 uint64_t len;
18124
18125                 if (copyin(arg, &hdr, sizeof (hdr)) != 0)
18126                         return (EFAULT);
18127
18128                 lck_mtx_lock(&dtrace_lock);
18129                 dof = dtrace_dof_create(state);
18130                 lck_mtx_unlock(&dtrace_lock);
18131
18132                 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
18133                 rval = copyout(dof, arg, len);
18134                 dtrace_dof_destroy(dof);
18135
18136                 return (rval == 0 ? 0 : EFAULT);
18137         }
18138
18139         case DTRACEIOC_SLEEP: {
18140                 int64_t time;
18141                 uint64_t abstime;
18142                 uint64_t rvalue = DTRACE_WAKE_TIMEOUT;
18143
18144                 if (copyin(arg, &time, sizeof(time)) != 0)
18145                         return (EFAULT);
18146
18147                 nanoseconds_to_absolutetime((uint64_t)time, &abstime);
18148                 clock_absolutetime_interval_to_deadline(abstime, &abstime);
18149
18150                 if (assert_wait_deadline(state, THREAD_ABORTSAFE, abstime) == THREAD_WAITING) {
18151                         if (state->dts_buf_over_limit > 0) {
18152                                 clear_wait(current_thread(), THREAD_INTERRUPTED);
18153                                 rvalue = DTRACE_WAKE_BUF_LIMIT;
18154                         } else {
18155                                 thread_block(THREAD_CONTINUE_NULL);
18156                                 if (state->dts_buf_over_limit > 0) {
18157                                         rvalue = DTRACE_WAKE_BUF_LIMIT;
18158                                 }
18159                         }
18160                 }
18161
18162                 if (copyout(&rvalue, arg, sizeof(rvalue)) != 0)
18163                         return (EFAULT);
18164
18165                 return (0);
18166         }
18167
18168         case DTRACEIOC_SIGNAL: {
18169                 wakeup(state);
18170                 return (0);
18171         }
18172
18173         case DTRACEIOC_AGGSNAP:
18174         case DTRACEIOC_BUFSNAP: {
18175                 dtrace_bufdesc_t desc;
18176                 caddr_t cached;
18177                 boolean_t over_limit;
18178                 dtrace_buffer_t *buf;
18179
18180                 if (copyin(arg, &desc, sizeof (desc)) != 0)
18181                         return (EFAULT);
18182
18183                 if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
18184                         return (EINVAL);
18185
18186                 lck_mtx_lock(&dtrace_lock);
18187
18188                 if (cmd == DTRACEIOC_BUFSNAP) {
18189                         buf = &state->dts_buffer[desc.dtbd_cpu];
18190                 } else {
18191                         buf = &state->dts_aggbuffer[desc.dtbd_cpu];
18192                 }
18193
18194                 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
18195                         size_t sz = buf->dtb_offset;
18196
18197                         if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
18198                                 lck_mtx_unlock(&dtrace_lock);
18199                                 return (EBUSY);
18200                         }
18201
18202                         /*
18203                          * If this buffer has already been consumed, we're
18204                          * going to indicate that there's nothing left here
18205                          * to consume.
18206                          */
18207                         if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
18208                                 lck_mtx_unlock(&dtrace_lock);
18209
18210                                 desc.dtbd_size = 0;
18211                                 desc.dtbd_drops = 0;
18212                                 desc.dtbd_errors = 0;
18213                                 desc.dtbd_oldest = 0;
18214                                 sz = sizeof (desc);
18215
18216                                 if (copyout(&desc, arg, sz) != 0)
18217                                         return (EFAULT);
18218
18219                                 return (0);
18220                         }
18221
18222                         /*
18223                          * If this is a ring buffer that has wrapped, we want
18224                          * to copy the whole thing out.
18225                          */
18226                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
18227                                 dtrace_buffer_polish(buf);
18228                                 sz = buf->dtb_size;
18229                         }
18230
18231                         if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
18232                                 lck_mtx_unlock(&dtrace_lock);
18233                                 return (EFAULT);
18234                         }
18235
18236                         desc.dtbd_size = sz;
18237                         desc.dtbd_drops = buf->dtb_drops;
18238                         desc.dtbd_errors = buf->dtb_errors;
18239                         desc.dtbd_oldest = buf->dtb_xamot_offset;
18240                         desc.dtbd_timestamp = dtrace_gethrtime();
18241
18242                         lck_mtx_unlock(&dtrace_lock);
18243
18244                         if (copyout(&desc, arg, sizeof (desc)) != 0)
18245                                 return (EFAULT);
18246
18247                         buf->dtb_flags |= DTRACEBUF_CONSUMED;
18248
18249                         return (0);
18250                 }
18251
18252                 if (buf->dtb_tomax == NULL) {
18253                         ASSERT(buf->dtb_xamot == NULL);
18254                         lck_mtx_unlock(&dtrace_lock);
18255                         return (ENOENT);
18256                 }
18257
18258                 cached = buf->dtb_tomax;
18259                 over_limit = buf->dtb_cur_limit == buf->dtb_size;
18260
18261                 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
18262
18263                 dtrace_xcall(desc.dtbd_cpu,
18264                         (dtrace_xcall_t)dtrace_buffer_switch, buf);
18265
18266                 state->dts_errors += buf->dtb_xamot_errors;
18267
18268                 /*
18269                 * If the buffers did not actually switch, then the cross call
18270                 * did not take place -- presumably because the given CPU is
18271                 * not in the ready set.  If this is the case, we'll return
18272                 * ENOENT.
18273                 */
18274                 if (buf->dtb_tomax == cached) {
18275                         ASSERT(buf->dtb_xamot != cached);
18276                         lck_mtx_unlock(&dtrace_lock);
18277                         return (ENOENT);
18278                 }
18279
18280                 ASSERT(cached == buf->dtb_xamot);
18281                 /*
18282                  * At this point we know the buffer have switched, so we
18283                  * can decrement the over limit count if the buffer was over
18284                  * its limit. The new buffer might already be over its limit
18285                  * yet, but we don't care since we're guaranteed not to be
18286                  * checking the buffer over limit count  at this point.
18287                  */
18288                 if (over_limit) {
18289                         uint32_t old = os_atomic_dec_orig(&state->dts_buf_over_limit, relaxed);
18290                         #pragma unused(old)
18291
18292                         /*
18293                          * Verify that we didn't underflow the value
18294                          */
18295                         ASSERT(old != 0);
18296                 }
18297
18298                 /*
18299                 * We have our snapshot; now copy it out.
18300                 */
18301                 if (dtrace_buffer_copyout(buf->dtb_xamot,
18302                                         (user_addr_t)desc.dtbd_data,
18303                                         buf->dtb_xamot_offset) != 0) {
18304                         lck_mtx_unlock(&dtrace_lock);
18305                         return (EFAULT);
18306                 }
18307
18308                 desc.dtbd_size = buf->dtb_xamot_offset;
18309                 desc.dtbd_drops = buf->dtb_xamot_drops;
18310                 desc.dtbd_errors = buf->dtb_xamot_errors;
18311                 desc.dtbd_oldest = 0;
18312                 desc.dtbd_timestamp = buf->dtb_switched;
18313
18314                 lck_mtx_unlock(&dtrace_lock);
18315
18316                 /*
18317                  * Finally, copy out the buffer description.
18318                  */
18319                 if (copyout(&desc, arg, sizeof (desc)) != 0)
18320                         return (EFAULT);
18321
18322                 return (0);
18323         }
18324
18325         case DTRACEIOC_CONF: {
18326                 dtrace_conf_t conf;
18327
18328                 bzero(&conf, sizeof (conf));
18329                 conf.dtc_difversion = DIF_VERSION;
18330                 conf.dtc_difintregs = DIF_DIR_NREGS;
18331                 conf.dtc_diftupregs = DIF_DTR_NREGS;
18332                 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
18333
18334                 if (copyout(&conf, arg, sizeof (conf)) != 0)
18335                         return (EFAULT);
18336
18337                 return (0);
18338         }
18339
18340         case DTRACEIOC_STATUS: {
18341                 dtrace_status_t stat;
18342                 dtrace_dstate_t *dstate;
18343                 int i, j;
18344                 uint64_t nerrs;
18345
18346                 /*
18347                 * See the comment in dtrace_state_deadman() for the reason
18348                 * for setting dts_laststatus to INT64_MAX before setting
18349                 * it to the correct value.
18350                 */
18351                 state->dts_laststatus = INT64_MAX;
18352                 dtrace_membar_producer();
18353                 state->dts_laststatus = dtrace_gethrtime();
18354
18355                 bzero(&stat, sizeof (stat));
18356
18357                 lck_mtx_lock(&dtrace_lock);
18358
18359                 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
18360                         lck_mtx_unlock(&dtrace_lock);
18361                         return (ENOENT);
18362                 }
18363
18364                 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
18365                         stat.dtst_exiting = 1;
18366
18367                 nerrs = state->dts_errors;
18368                 dstate = &state->dts_vstate.dtvs_dynvars;
18369
18370                 for (i = 0; i < (int)NCPU; i++) {
18371                         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
18372
18373                         stat.dtst_dyndrops += dcpu->dtdsc_drops;
18374                         stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
18375                         stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
18376
18377                         if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
18378                                 stat.dtst_filled++;
18379
18380                         nerrs += state->dts_buffer[i].dtb_errors;
18381
18382                         for (j = 0; j < state->dts_nspeculations; j++) {
18383                                 dtrace_speculation_t *spec;
18384                                 dtrace_buffer_t *buf;
18385
18386                                 spec = &state->dts_speculations[j];
18387                                 buf = &spec->dtsp_buffer[i];
18388                                 stat.dtst_specdrops += buf->dtb_xamot_drops;
18389                         }
18390                 }
18391
18392                 stat.dtst_specdrops_busy = state->dts_speculations_busy;
18393                 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
18394                 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
18395                 stat.dtst_dblerrors = state->dts_dblerrors;
18396                 stat.dtst_killed =
18397                         (state->dts_activity == DTRACE_ACTIVITY_KILLED);
18398                 stat.dtst_errors = nerrs;
18399
18400                 lck_mtx_unlock(&dtrace_lock);
18401
18402                 if (copyout(&stat, arg, sizeof (stat)) != 0)
18403                         return (EFAULT);
18404
18405                 return (0);
18406         }
18407
18408         case DTRACEIOC_FORMAT: {
18409                 dtrace_fmtdesc_t fmt;
18410                 char *str;
18411                 int len;
18412
18413                 if (copyin(arg, &fmt, sizeof (fmt)) != 0)
18414                         return (EFAULT);
18415
18416                 lck_mtx_lock(&dtrace_lock);
18417
18418                 if (fmt.dtfd_format == 0 ||
18419                         fmt.dtfd_format > state->dts_nformats) {
18420                         lck_mtx_unlock(&dtrace_lock);
18421                         return (EINVAL);
18422                 }
18423
18424                 /*
18425                  * Format strings are allocated contiguously and they are
18426                  * never freed; if a format index is less than the number
18427                  * of formats, we can assert that the format map is non-NULL
18428                  * and that the format for the specified index is non-NULL.
18429                  */
18430                 ASSERT(state->dts_formats != NULL);
18431                 str = state->dts_formats[fmt.dtfd_format - 1];
18432                 ASSERT(str != NULL);
18433
18434                 len = strlen(str) + 1;
18435
18436                 if (len > fmt.dtfd_length) {
18437                         fmt.dtfd_length = len;
18438
18439                         if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
18440                                 lck_mtx_unlock(&dtrace_lock);
18441                                 return (EINVAL);
18442                         }
18443                 } else {
18444                         if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
18445                                 lck_mtx_unlock(&dtrace_lock);
18446                                 return (EINVAL);
18447                         }
18448                 }
18449
18450                 lck_mtx_unlock(&dtrace_lock);
18451                 return (0);
18452         }
18453
18454         case DTRACEIOC_MODUUIDSLIST: {
18455                 size_t module_uuids_list_size;
18456                 dtrace_module_uuids_list_t* uuids_list;
18457                 uint64_t dtmul_count;
18458
18459                 /*
18460                  * Security restrictions make this operation illegal, if this is enabled DTrace
18461                  * must refuse to provide any fbt probes.
18462                  */
18463                 if (dtrace_fbt_probes_restricted()) {
18464                         cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
18465                         return (EPERM);
18466                 }
18467
18468                 /*
18469                  * Fail if the kernel symbol mode makes this operation illegal.
18470                  * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
18471                  * for them without holding the dtrace_lock.
18472                  */
18473                 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18474                     dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18475                         cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
18476                         return (EPERM);
18477                 }
18478
18479                 /*
18480                  * Read the number of symbolsdesc structs being passed in.
18481                  */
18482                 if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
18483                            &dtmul_count,
18484                            sizeof(dtmul_count))) {
18485                         cmn_err(CE_WARN, "failed to copyin dtmul_count");
18486                         return (EFAULT);
18487                 }
18488
18489                 /*
18490                  * Range check the count. More than 2k kexts is probably an error.
18491                  */
18492                 if (dtmul_count > 2048) {
18493                         cmn_err(CE_WARN, "dtmul_count is not valid");
18494                         return (EINVAL);
18495                 }
18496
18497                 /*
18498                  * For all queries, we return EINVAL when the user specified
18499                  * count does not match the actual number of modules we find
18500                  * available.
18501                  *
18502                  * If the user specified count is zero, then this serves as a
18503                  * simple query to count the available modules in need of symbols.
18504                  */
18505
18506                 rval = 0;
18507
18508                 if (dtmul_count == 0)
18509                 {
18510                         lck_mtx_lock(&mod_lock);
18511                         struct modctl* ctl = dtrace_modctl_list;
18512                         while (ctl) {
18513                                 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18514                                 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18515                                         dtmul_count++;
18516                                         rval = EINVAL;
18517                                 }
18518                                 ctl = ctl->mod_next;
18519                         }
18520                         lck_mtx_unlock(&mod_lock);
18521
18522                         if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
18523                                 return (EFAULT);
18524                         else
18525                                 return (rval);
18526                 }
18527
18528                 /*
18529                  * If we reach this point, then we have a request for full list data.
18530                  * Allocate a correctly sized structure and copyin the data.
18531                  */
18532                 module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
18533                 if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
18534                         return (ENOMEM);
18535
18536                 /* NOTE! We can no longer exit this method via return */
18537                 if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
18538                         cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
18539                         rval = EFAULT;
18540                         goto moduuidslist_cleanup;
18541                 }
18542
18543                 /*
18544                  * Check that the count didn't change between the first copyin and the second.
18545                  */
18546                 if (uuids_list->dtmul_count != dtmul_count) {
18547                         rval = EINVAL;
18548                         goto moduuidslist_cleanup;
18549                 }
18550
18551                 /*
18552                  * Build the list of UUID's that need symbols
18553                  */
18554                 lck_mtx_lock(&mod_lock);
18555
18556                 dtmul_count = 0;
18557
18558                 struct modctl* ctl = dtrace_modctl_list;
18559                 while (ctl) {
18560                         /*
18561                          * We assume that userspace symbols will be "better" than kernel level symbols,
18562                          * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms
18563                          * are available, add user syms if the module might use them.
18564                          */
18565                         ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18566                         if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18567                                 UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
18568                                 if (dtmul_count++ < uuids_list->dtmul_count) {
18569                                         memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
18570                                 }
18571                         }
18572                         ctl = ctl->mod_next;
18573                 }
18574
18575                 lck_mtx_unlock(&mod_lock);
18576
18577                 if (uuids_list->dtmul_count < dtmul_count)
18578                         rval = EINVAL;
18579
18580                 uuids_list->dtmul_count = dtmul_count;
18581
18582                 /*
18583                  * Copyout the symbols list (or at least the count!)
18584                  */
18585                 if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
18586                         cmn_err(CE_WARN, "failed copyout of dtrace_symbolsdesc_list_t");
18587                         rval = EFAULT;
18588                 }
18589
18590         moduuidslist_cleanup:
18591                 /*
18592                  * If we had to allocate struct memory, free it.
18593                  */
18594                 if (uuids_list != NULL) {
18595                         kmem_free(uuids_list, module_uuids_list_size);
18596                 }
18597
18598                 return rval;
18599         }
18600
18601         case DTRACEIOC_PROVMODSYMS: {
18602                 size_t module_symbols_size;
18603                 dtrace_module_symbols_t* module_symbols;
18604                 uint64_t dtmodsyms_count;
18605
18606                 /*
18607                  * Security restrictions make this operation illegal, if this is enabled DTrace
18608                  * must refuse to provide any fbt probes.
18609                  */
18610                 if (dtrace_fbt_probes_restricted()) {
18611                         cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
18612                         return (EPERM);
18613                 }
18614
18615                 /*
18616                  * Fail if the kernel symbol mode makes this operation illegal.
18617                  * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
18618                  * for them without holding the dtrace_lock.
18619                  */
18620                 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18621                     dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18622                         cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
18623                         return (EPERM);
18624                 }
18625
18626                 /*
18627                  * Read the number of module symbols structs being passed in.
18628                  */
18629                 if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
18630                            &dtmodsyms_count,
18631                            sizeof(dtmodsyms_count))) {
18632                         cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
18633                         return (EFAULT);
18634                 }
18635
18636                 /*
18637                  * Range check the count. How much data can we pass around?
18638                  * FIX ME!
18639                  */
18640                 if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) {
18641                         cmn_err(CE_WARN, "dtmodsyms_count is not valid");
18642                         return (EINVAL);
18643                 }
18644
18645                 /*
18646                  * Allocate a correctly sized structure and copyin the data.
18647                  */
18648                 module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
18649                 if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
18650                         return (ENOMEM);
18651
18652                 rval = 0;
18653
18654                 /* NOTE! We can no longer exit this method via return */
18655                 if (copyin(arg, module_symbols, module_symbols_size) != 0) {
18656                         cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t");
18657                         rval = EFAULT;
18658                         goto module_symbols_cleanup;
18659                 }
18660
18661                 /*
18662                  * Check that the count didn't change between the first copyin and the second.
18663                  */
18664                 if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
18665                         rval = EINVAL;
18666                         goto module_symbols_cleanup;
18667                 }
18668
18669                 /*
18670                  * Find the modctl to add symbols to.
18671                  */
18672                 lck_mtx_lock(&dtrace_provider_lock);
18673                 lck_mtx_lock(&mod_lock);
18674
18675                 struct modctl* ctl = dtrace_modctl_list;
18676                 while (ctl) {
18677                         ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18678                         if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
18679                                 dtrace_provider_t *prv;
18680                                 ctl->mod_user_symbols = module_symbols;
18681
18682                                 /*
18683                                  * We're going to call each providers per-module provide operation
18684                                  * specifying only this module.
18685                                  */
18686                                 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
18687                                         prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
18688                                 /*
18689                                  * We gave every provider a chance to provide with the user syms, go ahead and clear them
18690                                  */
18691                                 ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
18692                         }
18693                         ctl = ctl->mod_next;
18694                 }
18695
18696                 lck_mtx_unlock(&mod_lock);
18697                 lck_mtx_unlock(&dtrace_provider_lock);
18698
18699         module_symbols_cleanup:
18700                 /*
18701                  * If we had to allocate struct memory, free it.
18702                  */
18703                 if (module_symbols != NULL) {
18704                         kmem_free(module_symbols, module_symbols_size);
18705                 }
18706
18707                 return rval;
18708         }
18709
18710         case DTRACEIOC_PROCWAITFOR: {
18711                 dtrace_procdesc_t pdesc = {
18712                         .p_name = {0},
18713                         .p_pid  = -1
18714                 };
18715
18716                 if ((rval = copyin(arg, &pdesc, sizeof(pdesc))) != 0)
18717                         goto proc_waitfor_error;
18718
18719                 if ((rval = dtrace_proc_waitfor(&pdesc)) != 0)
18720                         goto proc_waitfor_error;
18721
18722                 if ((rval = copyout(&pdesc, arg, sizeof(pdesc))) != 0)
18723                         goto proc_waitfor_error;
18724
18725                 return 0;
18726
18727         proc_waitfor_error:
18728                 /* The process was suspended, revert this since the client will not do it. */
18729                 if (pdesc.p_pid != -1) {
18730                         proc_t *proc = proc_find(pdesc.p_pid);
18731                         if (proc != PROC_NULL) {
18732                                 task_pidresume(proc->task);
18733                                 proc_rele(proc);
18734                         }
18735                 }
18736
18737                 return rval;
18738         }
18739
18740         default:
18741                 break;
18742         }
18743
18744         return (ENOTTY);
18745 }
18746
18747 /*
18748  * APPLE NOTE:  dtrace_detach not implemented
18749  */
18750 #if !defined(__APPLE__)
18751 /*ARGSUSED*/
18752 static int
18753 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
18754 {
18755         dtrace_state_t *state;
18756
18757         switch (cmd) {
18758         case DDI_DETACH:
18759                 break;
18760
18761         case DDI_SUSPEND:
18762                 return (DDI_SUCCESS);
18763
18764         default:
18765                 return (DDI_FAILURE);
18766         }
18767
18768         lck_mtx_lock(&cpu_lock);
18769         lck_mtx_lock(&dtrace_provider_lock);
18770         lck_mtx_lock(&dtrace_lock);
18771
18772         ASSERT(dtrace_opens == 0);
18773
18774         if (dtrace_helpers > 0) {
18775                 lck_mtx_unlock(&dtrace_lock);
18776                 lck_mtx_unlock(&dtrace_provider_lock);
18777                 lck_mtx_unlock(&cpu_lock);
18778                 return (DDI_FAILURE);
18779         }
18780
18781         if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
18782                 lck_mtx_unlock(&dtrace_lock);
18783                 lck_mtx_unlock(&dtrace_provider_lock);
18784                 lck_mtx_unlock(&cpu_lock);
18785                 return (DDI_FAILURE);
18786         }
18787
18788         dtrace_provider = NULL;
18789
18790         if ((state = dtrace_anon_grab()) != NULL) {
18791                 /*
18792                  * If there were ECBs on this state, the provider should
18793                  * have not been allowed to detach; assert that there is
18794                  * none.
18795                  */
18796                 ASSERT(state->dts_necbs == 0);
18797                 dtrace_state_destroy(state);
18798
18799                 /*
18800                  * If we're being detached with anonymous state, we need to
18801                  * indicate to the kernel debugger that DTrace is now inactive.
18802                  */
18803                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
18804         }
18805
18806         bzero(&dtrace_anon, sizeof (dtrace_anon_t));
18807         unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
18808         dtrace_cpu_init = NULL;
18809         dtrace_helpers_cleanup = NULL;
18810         dtrace_helpers_fork = NULL;
18811         dtrace_cpustart_init = NULL;
18812         dtrace_cpustart_fini = NULL;
18813         dtrace_debugger_init = NULL;
18814         dtrace_debugger_fini = NULL;
18815         dtrace_kreloc_init = NULL;
18816         dtrace_kreloc_fini = NULL;
18817         dtrace_modload = NULL;
18818         dtrace_modunload = NULL;
18819
18820         lck_mtx_unlock(&cpu_lock);
18821
18822         if (dtrace_helptrace_enabled) {
18823                 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
18824                 dtrace_helptrace_buffer = NULL;
18825         }
18826
18827         kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
18828         dtrace_probes = NULL;
18829         dtrace_nprobes = 0;
18830
18831         dtrace_hash_destroy(dtrace_strings);
18832         dtrace_hash_destroy(dtrace_byprov);
18833         dtrace_hash_destroy(dtrace_bymod);
18834         dtrace_hash_destroy(dtrace_byfunc);
18835         dtrace_hash_destroy(dtrace_byname);
18836         dtrace_strings = NULL;
18837         dtrace_byprov = NULL;
18838         dtrace_bymod = NULL;
18839         dtrace_byfunc = NULL;
18840         dtrace_byname = NULL;
18841
18842         kmem_cache_destroy(dtrace_state_cache);
18843         vmem_destroy(dtrace_arena);
18844
18845         if (dtrace_toxrange != NULL) {
18846                 kmem_free(dtrace_toxrange,
18847                     dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
18848                 dtrace_toxrange = NULL;
18849                 dtrace_toxranges = 0;
18850                 dtrace_toxranges_max = 0;
18851         }
18852
18853         ddi_remove_minor_node(dtrace_devi, NULL);
18854         dtrace_devi = NULL;
18855
18856         ddi_soft_state_fini(&dtrace_softstate);
18857
18858         ASSERT(dtrace_vtime_references == 0);
18859         ASSERT(dtrace_opens == 0);
18860         ASSERT(dtrace_retained == NULL);
18861
18862         lck_mtx_unlock(&dtrace_lock);
18863         lck_mtx_unlock(&dtrace_provider_lock);
18864
18865 #ifdef illumos
18866         /*
18867          * We don't destroy the task queue until after we have dropped our
18868          * locks (taskq_destroy() may block on running tasks).  To prevent
18869          * attempting to do work after we have effectively detached but before
18870          * the task queue has been destroyed, all tasks dispatched via the
18871          * task queue must check that DTrace is still attached before
18872          * performing any operation.
18873          */
18874         taskq_destroy(dtrace_taskq);
18875         dtrace_taskq = NULL;
18876 #endif
18877
18878         return (DDI_SUCCESS);
18879 }
18880 #endif  /* __APPLE__ */
18881
/*
 * Forward declarations of the character-device entry points defined below.
 * The _dtrace_* routines are installed in dtrace_cdevsw and the helper_*
 * routines are installed in helper_cdevsw.
 */
d_open_t _dtrace_open, helper_open;
d_close_t _dtrace_close, helper_close;
d_ioctl_t _dtrace_ioctl, helper_ioctl;
18885
/*
 * cdevsw open entry point for the dtrace device.
 *
 * Forwards the request to dtrace_open(), passing the address of a local copy
 * of the device number along with the current credentials.  The calling
 * process pointer is not needed.  Returns whatever dtrace_open() returns.
 */
int
_dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(p)
        dev_t dev_copy = dev;
        int result;

        /* dtrace_open() takes a dev_t pointer; give it our private copy. */
        result = dtrace_open(&dev_copy, flags, devtype, CRED());

        return result;
}
18894
/*
 * cdevsw open entry point for the helper device.
 *
 * Nothing has to be set up at open time, so every argument is ignored and
 * the open always succeeds (returns 0).
 */
int
helper_open(dev_t dev, int flags, int devtype, struct proc *p)
{
        (void)dev;
        (void)flags;
        (void)devtype;
        (void)p;

        return 0;
}
18901
/*
 * cdevsw close entry point for the dtrace device.
 *
 * Forwards the request to dtrace_close() with the current credentials; the
 * calling process pointer is not needed.  Returns whatever dtrace_close()
 * returns.
 */
int
_dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(p)
        int result = dtrace_close(dev, flags, devtype, CRED());

        return result;
}
18908
/*
 * cdevsw close entry point for the helper device.
 *
 * There is nothing to tear down at close time, so every argument is ignored
 * and the close always succeeds (returns 0).
 */
int
helper_close(dev_t dev, int flags, int devtype, struct proc *p)
{
        (void)dev;
        (void)flags;
        (void)devtype;
        (void)p;

        return 0;
}
18915
18916 int
18917 _dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
18918 {
18919 #pragma unused(p)
18920         int err, rv = 0;
18921     user_addr_t uaddrp;
18922
18923     if (proc_is64bit(p))
18924                 uaddrp = *(user_addr_t *)data;
18925         else
18926                 uaddrp = (user_addr_t) *(uint32_t *)data;
18927
18928         err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
18929
18930         /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
18931         if (err != 0) {
18932                 ASSERT( (err & 0xfffff000) == 0 );
18933                 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
18934         } else if (rv != 0) {
18935                 ASSERT( (rv & 0xfff00000) == 0 );
18936                 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
18937         } else
18938                 return 0;
18939 }
18940
18941 int
18942 helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
18943 {
18944 #pragma unused(dev,fflag,p)
18945         int err, rv = 0;
18946
18947         err = dtrace_ioctl_helper(cmd, data, &rv);
18948         /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
18949         if (err != 0) {
18950                 ASSERT( (err & 0xfffff000) == 0 );
18951                 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
18952         } else if (rv != 0) {
18953                 ASSERT( (rv & 0xfff00000) == 0 );
18954                 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
18955         } else
18956                 return 0;
18957 }
18958
/* Major number argument handed to cdevsw_add() by helper_init(). */
#define HELPER_MAJOR  -24 /* let the kernel pick the device number */

/*
 * Character-device switch table for the helper device: it lists which
 * functions get invoked for each device action.  Only open, close and
 * ioctl have helper-specific handlers; the remaining slots use the generic
 * eno_* / nulldev entries.
 */
static struct cdevsw helper_cdevsw =
{
        helper_open,            /* open */
        helper_close,           /* close */
        eno_rdwrt,                      /* read */
        eno_rdwrt,                      /* write */
        helper_ioctl,           /* ioctl */
        (stop_fcn_t *)nulldev, /* stop */
        (reset_fcn_t *)nulldev, /* reset */
        NULL,                           /* tty's */
        eno_select,                     /* select */
        eno_mmap,                       /* mmap */
        eno_strat,                      /* strategy */
        eno_getc,                       /* getc */
        eno_putc,                       /* putc */
        0                                       /* type */
};

/*
 * Value returned by cdevsw_add() for the helper device.  It is 0 until
 * helper_init() runs and negative if that registration failed.
 */
static int helper_majdevno = 0;

/*
 * Set to 1 at the end of a successful dtrace_init(); helper_init() panics
 * if it is still 0.
 */
static int gDTraceInited = 0;
18986
18987 void
18988 helper_init( void )
18989 {
18990         /*
18991          * Once the "helper" is initialized, it can take ioctl calls that use locks
18992          * and zones initialized in dtrace_init. Make certain dtrace_init was called
18993          * before us.
18994          */
18995
18996         if (!gDTraceInited) {
18997                 panic("helper_init before dtrace_init\n");
18998         }
18999
19000         if (0 >= helper_majdevno)
19001         {
19002                 helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
19003
19004                 if (helper_majdevno < 0) {
19005                         printf("helper_init: failed to allocate a major number!\n");
19006                         return;
19007                 }
19008
19009                 if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19010                                         DTRACEMNR_HELPER, 0 )) {
19011                         printf("dtrace_init: failed to devfs_make_node for helper!\n");
19012                         return;
19013                 }
19014         } else
19015                 panic("helper_init: called twice!\n");
19016 }
19017
19018 #undef HELPER_MAJOR
19019
19020 static int
19021 dtrace_clone_func(dev_t dev, int action)
19022 {
19023 #pragma unused(dev)
19024
19025         if (action == DEVFS_CLONE_ALLOC) {
19026                 return dtrace_state_reserve();
19027         }
19028         else if (action == DEVFS_CLONE_FREE) {
19029                 return 0;
19030         }
19031         else return -1;
19032 }
19033
19034 void dtrace_ast(void);
19035
19036 void
19037 dtrace_ast(void)
19038 {
19039         int i;
19040         uint32_t clients = os_atomic_xchg(&dtrace_wake_clients, 0, relaxed);
19041         if (clients == 0)
19042                 return;
19043         /**
19044          * We disable preemption here to be sure that we won't get
19045          * interrupted by a wakeup to a thread that is higher
19046          * priority than us, so that we do issue all wakeups
19047          */
19048         disable_preemption();
19049         for (i = 0; i < DTRACE_NCLIENTS; i++) {
19050                 if (clients & (1 << i)) {
19051                         dtrace_state_t *state = dtrace_state_get(i);
19052                         if (state) {
19053                                 wakeup(state);
19054                         }
19055
19056                 }
19057         }
19058         enable_preemption();
19059 }
19060
19061
/* Major number argument handed to cdevsw_add() by dtrace_init(). */
#define DTRACE_MAJOR  -24 /* let the kernel pick the device number */

/*
 * Character-device switch table for the dtrace device.  Only open, close
 * and ioctl have dtrace-specific handlers; the remaining slots use the
 * generic eno_* / nulldev entries.
 */
static struct cdevsw dtrace_cdevsw =
{
        _dtrace_open,           /* open */
        _dtrace_close,          /* close */
        eno_rdwrt,                      /* read */
        eno_rdwrt,                      /* write */
        _dtrace_ioctl,          /* ioctl */
        (stop_fcn_t *)nulldev, /* stop */
        (reset_fcn_t *)nulldev, /* reset */
        NULL,                           /* tty's */
        eno_select,                     /* select */
        eno_mmap,                       /* mmap */
        eno_strat,                      /* strategy */
        eno_getc,                       /* getc */
        eno_putc,                       /* putc */
        0                                       /* type */
};

/*
 * Lock attribute, lock group attribute and lock group used for the locks
 * initialized in dtrace_init(); all three are allocated there.
 */
lck_attr_t* dtrace_lck_attr;
lck_grp_attr_t* dtrace_lck_grp_attr;
lck_grp_t* dtrace_lck_grp;

/*
 * Value returned by cdevsw_add() for the dtrace device; set in
 * dtrace_init() and used by dtrace_postinit() to build the device number.
 */
static int gMajDevNo;
19087
19088 void dtrace_early_init (void)
19089 {
19090         dtrace_restriction_policy_load();
19091
19092         /*
19093          * See dtrace_impl.h for a description of kernel symbol modes.
19094          * The default is to wait for symbols from userspace (lazy symbols).
19095          */
19096         if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
19097                 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
19098         }
19099 }
19100
19101 void
19102 dtrace_init( void )
19103 {
19104         if (0 == gDTraceInited) {
19105                 int i, ncpu;
19106                 size_t size = sizeof(dtrace_buffer_memory_maxsize);
19107
19108                 /*
19109                  * DTrace allocates buffers based on the maximum number
19110                  * of enabled cpus. This call avoids any race when finding
19111                  * that count.
19112                  */
19113                 ASSERT(dtrace_max_cpus == 0);
19114                 ncpu = dtrace_max_cpus = ml_get_max_cpus();
19115
19116                 /*
19117                  * Retrieve the size of the physical memory in order to define
19118                  * the state buffer memory maximal size.  If we cannot retrieve
19119                  * this value, we'll consider that we have 1Gb of memory per CPU, that's
19120                  * still better than raising a kernel panic.
19121                  */
19122                 if (0 != kernel_sysctlbyname("hw.memsize", &dtrace_buffer_memory_maxsize,
19123                                              &size, NULL, 0))
19124                 {
19125                         dtrace_buffer_memory_maxsize = ncpu * 1024 * 1024 * 1024;
19126                         printf("dtrace_init: failed to retrieve the hw.memsize, defaulted to %lld bytes\n",
19127                                dtrace_buffer_memory_maxsize);
19128                 }
19129
19130                 /*
19131                  * Finally, divide by three to prevent DTrace from eating too
19132                  * much memory.
19133                  */
19134                 dtrace_buffer_memory_maxsize /= 3;
19135                 ASSERT(dtrace_buffer_memory_maxsize > 0);
19136
19137                 gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
19138
19139                 if (gMajDevNo < 0) {
19140                         printf("dtrace_init: failed to allocate a major number!\n");
19141                         gDTraceInited = 0;
19142                         return;
19143                 }
19144
19145                 if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19146                                         dtrace_clone_func, DTRACEMNR_DTRACE, 0 )) {
19147                         printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
19148                         gDTraceInited = 0;
19149                         return;
19150                 }
19151
19152                 /*
19153                  * Allocate the dtrace_probe_t zone
19154                  */
19155                 dtrace_probe_t_zone = zinit(sizeof(dtrace_probe_t),
19156                                             1024 * sizeof(dtrace_probe_t),
19157                                             sizeof(dtrace_probe_t),
19158                                             "dtrace.dtrace_probe_t");
19159
19160                 /*
19161                  * Create the dtrace lock group and attrs.
19162                  */
19163                 dtrace_lck_attr = lck_attr_alloc_init();
19164                 dtrace_lck_grp_attr= lck_grp_attr_alloc_init();
19165                 dtrace_lck_grp = lck_grp_alloc_init("dtrace",  dtrace_lck_grp_attr);
19166
19167                 /*
19168                  * We have to initialize all locks explicitly
19169                  */
19170                 lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr);
19171                 lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr);
19172                 lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr);
19173                 lck_mtx_init(&dtrace_procwaitfor_lock, dtrace_lck_grp, dtrace_lck_attr);
19174 #if DEBUG
19175                 lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr);
19176 #endif
19177                 lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr);
19178
19179                 /*
19180                  * The cpu_core structure consists of per-CPU state available in any context.
19181                  * On some architectures, this may mean that the page(s) containing the
19182                  * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
19183                  * is up to the platform to assure that this is performed properly.  Note that
19184                  * the structure is sized to avoid false sharing.
19185                  */
19186                 lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr);
19187                 lck_mtx_init(&cyc_lock, dtrace_lck_grp, dtrace_lck_attr);
19188                 lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr);
19189
19190                 /*
19191                  * Initialize the CPU offline/online hooks.
19192                  */
19193                 dtrace_install_cpu_hooks();
19194
19195                 dtrace_modctl_list = NULL;
19196
19197                 cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
19198                 for (i = 0; i < ncpu; ++i) {
19199                         lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr);
19200                 }
19201
19202                 cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
19203                 for (i = 0; i < ncpu; ++i) {
19204                         cpu_list[i].cpu_id = (processorid_t)i;
19205                         cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]);
19206                         LIST_INIT(&cpu_list[i].cpu_cyc_list);
19207                         lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr);
19208                 }
19209
19210                 lck_mtx_lock(&cpu_lock);
19211                 for (i = 0; i < ncpu; ++i)
19212                         /* FIXME: track CPU configuration */
19213                         dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */
19214                 lck_mtx_unlock(&cpu_lock);
19215
19216                 (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
19217
19218                 dtrace_strings = dtrace_hash_create(dtrace_strkey_offset,
19219                     offsetof(dtrace_string_t, dtst_str),
19220                     offsetof(dtrace_string_t, dtst_next),
19221                     offsetof(dtrace_string_t, dtst_prev));
19222
19223                 dtrace_isa_init();
19224                 /*
19225                  * See dtrace_impl.h for a description of dof modes.
19226                  * The default is lazy dof.
19227                  *
19228                  * FIXME: Warn if state is LAZY_OFF? It won't break anything, but
19229                  * makes no sense...
19230                  */
19231                 if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
19232 #if CONFIG_EMBEDDED
19233                         /* Disable DOF mode by default for performance reasons */
19234                         dtrace_dof_mode = DTRACE_DOF_MODE_NEVER;
19235 #else
19236                         dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
19237 #endif
19238                 }
19239
19240                 /*
19241                  * Sanity check of dof mode value.
19242                  */
19243                 switch (dtrace_dof_mode) {
19244                         case DTRACE_DOF_MODE_NEVER:
19245                         case DTRACE_DOF_MODE_LAZY_ON:
19246                                 /* valid modes, but nothing else we need to do */
19247                                 break;
19248
19249                         case DTRACE_DOF_MODE_LAZY_OFF:
19250                         case DTRACE_DOF_MODE_NON_LAZY:
19251                                 /* Cannot wait for a dtrace_open to init fasttrap */
19252                                 fasttrap_init();
19253                                 break;
19254
19255                         default:
19256                                 /* Invalid, clamp to non lazy */
19257                                 dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
19258                                 fasttrap_init();
19259                                 break;
19260                 }
19261
19262 #if CONFIG_DTRACE
19263         if (dtrace_dof_mode != DTRACE_DOF_MODE_NEVER)
19264             commpage_update_dof(true);
19265 #endif
19266
19267                 gDTraceInited = 1;
19268
19269         } else
19270                 panic("dtrace_init: called twice!\n");
19271 }
19272
19273 void
19274 dtrace_postinit(void)
19275 {
19276         /*
19277          * Called from bsd_init after all provider's *_init() routines have been
19278          * run. That way, anonymous DOF enabled under dtrace_attach() is safe
19279          * to go.
19280          */
19281         dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0)); /* Punning a dev_t to a dev_info_t* */
19282
19283         /*
19284          * Add the mach_kernel to the module list for lazy processing
19285          */
19286         struct kmod_info fake_kernel_kmod;
19287         memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));
19288
19289         strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
19290         fake_kernel_kmod.id = 1;
19291         fake_kernel_kmod.address = g_kernel_kmod_info.address;
19292         fake_kernel_kmod.size = g_kernel_kmod_info.size;
19293
19294         if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) {
19295                 printf("dtrace_postinit: Could not register mach_kernel modctl\n");
19296         }
19297
19298         (void)OSKextRegisterKextsWithDTrace();
19299 }
19300 #undef DTRACE_MAJOR
19301
19302 /*
19303  * Routines used to register interest in cpu's being added to or removed
19304  * from the system.
19305  */
19306 void
19307 register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19308 {
19309 #pragma unused(ignore1,ignore2)
19310 }
19311
19312 void
19313 unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19314 {
19315 #pragma unused(ignore1,ignore2)
19316 }