1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Portions Copyright (c) 2013, 2016, Joyent, Inc. All rights reserved.
24 * Portions Copyright (c) 2013 by Delphix. All rights reserved.
25 */
26
27 /*
28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29 * Use is subject to license terms.
30 */
31
32 /*
33 * DTrace - Dynamic Tracing for Solaris
34 *
35 * This is the implementation of the Solaris Dynamic Tracing framework
36 * (DTrace). The user-visible interface to DTrace is described at length in
37 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
38 * library, the in-kernel DTrace framework, and the DTrace providers are
39 * described in the block comments in the <sys/dtrace.h> header file. The
40 * internal architecture of DTrace is described in the block comments in the
41 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
42 * implementation very much assume mastery of all of these sources; if one has
43 * an unanswered question about the implementation, one should consult them
44 * first.
45 *
46 * The functions here are ordered roughly as follows:
47 *
48 * - Probe context functions
49 * - Probe hashing functions
50 * - Non-probe context utility functions
51 * - Matching functions
52 * - Provider-to-Framework API functions
53 * - Probe management functions
54 * - DIF object functions
55 * - Format functions
56 * - Predicate functions
57 * - ECB functions
58 * - Buffer functions
59 * - Enabling functions
60 * - DOF functions
61 * - Anonymous enabling functions
62 * - Process functions
63 * - Consumer state functions
64 * - Helper functions
65 * - Hook functions
66 * - Driver cookbook functions
67 *
68 * Each group of functions begins with a block comment labelled the "DTrace
69 * [Group] Functions", allowing one to find each block by searching forward
70 * on capital-f functions.
71 */
72 #include <sys/errno.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75 #include <sys/conf.h>
76 #include <sys/random.h>
77 #include <sys/systm.h>
78 #include <sys/dtrace_impl.h>
79 #include <sys/param.h>
80 #include <sys/proc_internal.h>
81 #include <sys/ioctl.h>
82 #include <sys/fcntl.h>
83 #include <miscfs/devfs/devfs.h>
84 #include <sys/malloc.h>
85 #include <sys/kernel_types.h>
86 #include <sys/proc_internal.h>
87 #include <sys/uio_internal.h>
88 #include <sys/kauth.h>
89 #include <vm/pmap.h>
90 #include <sys/user.h>
91 #include <mach/exception_types.h>
92 #include <sys/signalvar.h>
93 #include <mach/task.h>
94 #include <kern/zalloc.h>
95 #include <kern/ast.h>
96 #include <kern/sched_prim.h>
97 #include <kern/task.h>
98 #include <netinet/in.h>
99 #include <libkern/sysctl.h>
100 #include <sys/kdebug.h>
101
102 #if MONOTONIC
103 #include <kern/monotonic.h>
104 #include <machine/monotonic.h>
105 #endif /* MONOTONIC */
106
107 #include "dtrace_xoroshiro128_plus.h"
108
109 #include <IOKit/IOPlatformExpert.h>
110
111 #include <kern/cpu_data.h>
112 extern uint32_t pmap_find_phys(void *, uint64_t);
113 extern boolean_t pmap_valid_page(uint32_t);
114 extern void OSKextRegisterKextsWithDTrace(void);
115 extern kmod_info_t g_kernel_kmod_info;
116 extern void commpage_update_dof(boolean_t enabled);
117
118 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
119 #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
120
121 #define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */
122
123 extern void dtrace_suspend(void);
124 extern void dtrace_resume(void);
125 extern void dtrace_early_init(void);
126 extern int dtrace_keep_kernel_symbols(void);
127 extern void dtrace_init(void);
128 extern void helper_init(void);
129 extern void fasttrap_init(void);
130
131 static int dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
132 extern void dtrace_lazy_dofs_destroy(proc_t *);
133 extern void dtrace_postinit(void);
134
135 extern void dtrace_proc_fork(proc_t*, proc_t*, int);
136 extern void dtrace_proc_exec(proc_t*);
137 extern void dtrace_proc_exit(proc_t*);
138
139 /*
140 * DTrace Tunable Variables
141 *
142 * The following variables may be dynamically tuned by using sysctl(8), the
143 * variables being stored in the kern.dtrace namespace. For example:
144 * sysctl kern.dtrace.dof_maxsize = 1048575 # 1M
145 *
146 * In general, the only variables that one should be tuning this way are those
147 * that affect system-wide DTrace behavior, and for which the default behavior
148 * is undesirable. Most of these variables are tunable on a per-consumer
149 * basis using DTrace options, and need not be tuned on a system-wide basis.
150 * When tuning these variables, avoid pathological values; while some attempt
151 * is made to verify the integrity of these variables, they are not considered
152 * part of the supported interface to DTrace, and they are therefore not
153 * checked comprehensively.
154 */
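/*
 * APPLE NOTE: purely as an illustrative sketch (not part of this file), a
 * userland tool could inspect one of these tunables with sysctlbyname(3).
 * The tunable name matches the SYSCTL_PROC declarations below; the
 * surrounding code is example-only:
 *
 *	#include <stdio.h>
 *	#include <stdint.h>
 *	#include <sys/sysctl.h>
 *
 *	int64_t val;
 *	size_t len = sizeof (val);
 *	if (sysctlbyname("kern.dtrace.dof_maxsize", &val, &len, NULL, 0) == 0)
 *		printf("dof_maxsize = %lld\n", (long long)val);
 */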
155 uint64_t dtrace_buffer_memory_maxsize = 0; /* initialized in dtrace_init */
156 uint64_t dtrace_buffer_memory_inuse = 0;
157 int dtrace_destructive_disallow = 0;
158 dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
159 size_t dtrace_difo_maxsize = (256 * 1024);
160 dtrace_optval_t dtrace_dof_maxsize = (512 * 1024);
161 dtrace_optval_t dtrace_statvar_maxsize = (16 * 1024);
162 dtrace_optval_t dtrace_statvar_maxsize_max = (16 * 10 * 1024);
163 size_t dtrace_actions_max = (16 * 1024);
164 size_t dtrace_retain_max = 1024;
165 dtrace_optval_t dtrace_helper_actions_max = 32;
166 dtrace_optval_t dtrace_helper_providers_max = 64;
167 dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
168 size_t dtrace_strsize_default = 256;
169 dtrace_optval_t dtrace_strsize_min = 8;
170 dtrace_optval_t dtrace_strsize_max = 65536;
171 dtrace_optval_t dtrace_cleanrate_default = 990099000; /* 1.01 hz */
172 dtrace_optval_t dtrace_cleanrate_min = 20000000; /* 50 hz */
173 dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
174 dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
175 dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
176 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
177 dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
178 dtrace_optval_t dtrace_nspec_default = 1;
179 dtrace_optval_t dtrace_specsize_default = 32 * 1024;
180 dtrace_optval_t dtrace_stackframes_default = 20;
181 dtrace_optval_t dtrace_ustackframes_default = 20;
182 dtrace_optval_t dtrace_jstackframes_default = 50;
183 dtrace_optval_t dtrace_jstackstrsize_default = 512;
184 dtrace_optval_t dtrace_buflimit_default = 75;
185 dtrace_optval_t dtrace_buflimit_min = 1;
186 dtrace_optval_t dtrace_buflimit_max = 99;
187 size_t dtrace_nprobes_default = 4;
188 int dtrace_msgdsize_max = 128;
189 hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */
190 hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
191 int dtrace_devdepth_max = 32;
192 int dtrace_err_verbose;
193 hrtime_t dtrace_deadman_interval = NANOSEC;
194 hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
195 hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
196
197 /*
198 * DTrace External Variables
199 *
200 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
201 * available to DTrace consumers via the backtick (`) syntax. One of these,
202 * dtrace_zero, is made deliberately so: it is provided as a source of
203 * well-known, zero-filled memory. While this variable is not documented,
204 * it is used by some translators as an implementation detail.
205 */
206 const char dtrace_zero[256] = { 0 }; /* zero-filled memory */
207 unsigned int dtrace_max_cpus = 0; /* number of enabled cpus */
208 /*
209 * DTrace Internal Variables
210 */
211 static dev_info_t *dtrace_devi; /* device info */
212 static vmem_t *dtrace_arena; /* probe ID arena */
213 static dtrace_probe_t **dtrace_probes; /* array of all probes */
214 static int dtrace_nprobes; /* number of probes */
215 static dtrace_provider_t *dtrace_provider; /* provider list */
216 static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
217 static int dtrace_opens; /* number of opens */
218 static int dtrace_helpers; /* number of helpers */
219 static dtrace_hash_t *dtrace_strings;
220 static dtrace_hash_t *dtrace_byprov; /* probes hashed by provider */
221 static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
222 static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
223 static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
224 static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
225 static int dtrace_toxranges; /* number of toxic ranges */
226 static int dtrace_toxranges_max; /* size of toxic range array */
227 static dtrace_anon_t dtrace_anon; /* anonymous enabling */
228 static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
229 static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
230 static kthread_t *dtrace_panicked; /* panicking thread */
231 static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
232 static dtrace_genid_t dtrace_probegen; /* current probe generation */
233 static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
234 static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
235 static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
236 static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
237
238 static int dtrace_dof_mode; /* See dtrace_impl.h for a description of Darwin's dof modes. */
239
240 /*
241 * This doesn't quite fit as an internal variable, as it must be accessed in
242 * fbt_provide and sdt_provide. It's clearly not a dtrace tunable variable either...
243 */
244 int dtrace_kernel_symbol_mode; /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
245 static uint32_t dtrace_wake_clients;
246 static uint8_t dtrace_kerneluuid[16]; /* the 128-bit uuid */
247
248 /*
249 * To save memory, some common memory allocations are given a
250 * unique zone. For example, dtrace_probe_t is 72 bytes in size,
251 * which means it would fall into the kalloc.128 bucket. With
252 * 20k elements allocated, the space saved is substantial.
253 */
254
255 struct zone *dtrace_probe_t_zone;
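/*
 * As a hedged sketch only: the zone itself is created during initialization
 * elsewhere; with the zalloc API of this era, a creation call might look like
 * the following (sizes illustrative, not normative):
 *
 *	dtrace_probe_t_zone = zinit(sizeof (dtrace_probe_t),
 *	    1024 * sizeof (dtrace_probe_t), sizeof (dtrace_probe_t),
 *	    "dtrace.dtrace_probe_t");
 */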
256
257 static int dtrace_module_unloaded(struct kmod_info *kmod);
258
259 /*
260 * DTrace Locking
261 * DTrace is protected by three (relatively coarse-grained) locks:
262 *
263 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
264 * including enabling state, probes, ECBs, consumer state, helper state,
265 * etc. Importantly, dtrace_lock is _not_ required when in probe context;
266 * probe context is lock-free -- synchronization is handled via the
267 * dtrace_sync() cross call mechanism.
268 *
269 * (2) dtrace_provider_lock is required when manipulating provider state, or
270 * when provider state must be held constant.
271 *
272 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
273 * when meta provider state must be held constant.
274 *
275 * The lock ordering between these three locks is dtrace_meta_lock before
276 * dtrace_provider_lock before dtrace_lock. (In particular, there are
277 * several places where dtrace_provider_lock is held by the framework as it
278 * calls into the providers -- which then call back into the framework,
279 * grabbing dtrace_lock.)
280 *
281 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
282 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
283 * role as a coarse-grained lock; it is acquired before both of these locks.
284 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
285 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
286 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
287 * acquired _between_ dtrace_provider_lock and dtrace_lock.
288 */
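/*
 * Illustrative sketch only (not code from this file): a path that needed all
 * three framework locks would acquire and release them in the documented
 * order, e.g.:
 *
 *	lck_mtx_lock(&dtrace_meta_lock);
 *	lck_mtx_lock(&dtrace_provider_lock);
 *	lck_mtx_lock(&dtrace_lock);
 *	...
 *	lck_mtx_unlock(&dtrace_lock);
 *	lck_mtx_unlock(&dtrace_provider_lock);
 *	lck_mtx_unlock(&dtrace_meta_lock);
 */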
289
290
291 /*
292 * APPLE NOTE:
293 *
294 * For porting purposes, all kmutex_t vars have been changed
295 * to lck_mtx_t, which require explicit initialization.
296 *
297 * kmutex_t becomes lck_mtx_t
298 * mutex_enter() becomes lck_mtx_lock()
299 * mutex_exit() becomes lck_mtx_unlock()
300 *
301 * Lock asserts are changed like this:
302 *
303 * ASSERT(MUTEX_HELD(&cpu_lock));
304 * becomes:
305 * LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
306 *
307 */
308 static lck_mtx_t dtrace_lock; /* probe state lock */
309 static lck_mtx_t dtrace_provider_lock; /* provider state lock */
310 static lck_mtx_t dtrace_meta_lock; /* meta-provider state lock */
311 static lck_rw_t dtrace_dof_mode_lock; /* dof mode lock */
312
313 /*
314 * DTrace Provider Variables
315 *
316 * These are the variables relating to DTrace as a provider (that is, the
317 * provider of the BEGIN, END, and ERROR probes).
318 */
319 static dtrace_pattr_t dtrace_provider_attr = {
320 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
321 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
322 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
323 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
324 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
325 };
326
327 static void
328 dtrace_provide_nullop(void *arg, const dtrace_probedesc_t *desc)
329 {
330 #pragma unused(arg, desc)
331 }
332
333 static void
334 dtrace_provide_module_nullop(void *arg, struct modctl *ctl)
335 {
336 #pragma unused(arg, ctl)
337 }
338
339 static int
340 dtrace_enable_nullop(void *arg, dtrace_id_t id, void *parg)
341 {
342 #pragma unused(arg, id, parg)
343 return (0);
344 }
345
346 static void
347 dtrace_disable_nullop(void *arg, dtrace_id_t id, void *parg)
348 {
349 #pragma unused(arg, id, parg)
350 }
351
352 static void
353 dtrace_suspend_nullop(void *arg, dtrace_id_t id, void *parg)
354 {
355 #pragma unused(arg, id, parg)
356 }
357
358 static void
359 dtrace_resume_nullop(void *arg, dtrace_id_t id, void *parg)
360 {
361 #pragma unused(arg, id, parg)
362 }
363
364 static void
365 dtrace_destroy_nullop(void *arg, dtrace_id_t id, void *parg)
366 {
367 #pragma unused(arg, id, parg)
368 }
369
370
371 static dtrace_pops_t dtrace_provider_ops = {
372 .dtps_provide = dtrace_provide_nullop,
373 .dtps_provide_module = dtrace_provide_module_nullop,
374 .dtps_enable = dtrace_enable_nullop,
375 .dtps_disable = dtrace_disable_nullop,
376 .dtps_suspend = dtrace_suspend_nullop,
377 .dtps_resume = dtrace_resume_nullop,
378 .dtps_getargdesc = NULL,
379 .dtps_getargval = NULL,
380 .dtps_usermode = NULL,
381 .dtps_destroy = dtrace_destroy_nullop,
382 };
383
384 static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
385 static dtrace_id_t dtrace_probeid_end; /* special END probe */
386 dtrace_id_t dtrace_probeid_error; /* special ERROR probe */
387
388 /*
389 * DTrace Helper Tracing Variables
390 */
391 uint32_t dtrace_helptrace_next = 0;
392 uint32_t dtrace_helptrace_nlocals;
393 char *dtrace_helptrace_buffer;
394 size_t dtrace_helptrace_bufsize = 512 * 1024;
395
396 #if DEBUG
397 int dtrace_helptrace_enabled = 1;
398 #else
399 int dtrace_helptrace_enabled = 0;
400 #endif
401
402 #if defined (__arm64__)
403 /*
404 * The ioctl for adding helper DOF is based on the
405 * size of a user_addr_t. We need to recognize both
406 * U32 and U64 as the same action.
407 */
408 #define DTRACEHIOC_ADDDOF_U32 _IOW('h', 4, user32_addr_t)
409 #define DTRACEHIOC_ADDDOF_U64 _IOW('h', 4, user64_addr_t)
410 #endif /* __arm64__ */
411
412 /*
413 * DTrace Error Hashing
414 *
415 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
416 * table. This is very useful for checking coverage of tests that are
417 * expected to induce DIF or DOF processing errors, and may be useful for
418 * debugging problems in the DIF code generator or in DOF generation. The
419 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
420 */
421 #if DEBUG
422 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
423 static const char *dtrace_errlast;
424 static kthread_t *dtrace_errthread;
425 static lck_mtx_t dtrace_errlock;
426 #endif
427
428 /*
429 * DTrace Macros and Constants
430 *
431 * These are various macros that are useful in various spots in the
432 * implementation, along with a few random constants that have no meaning
433 * outside of the implementation. There is no real structure to this cpp
434 * mishmash -- but is there ever?
435 */
436
437 #define DTRACE_GETSTR(hash, elm) \
438 (hash->dth_getstr(elm, hash->dth_stroffs))
439
440 #define DTRACE_HASHSTR(hash, elm) \
441 dtrace_hash_str(DTRACE_GETSTR(hash, elm))
442
443 #define DTRACE_HASHNEXT(hash, elm) \
444 (void**)((uintptr_t)(elm) + (hash)->dth_nextoffs)
445
446 #define DTRACE_HASHPREV(hash, elm) \
447 (void**)((uintptr_t)(elm) + (hash)->dth_prevoffs)
448
449 #define DTRACE_HASHEQ(hash, lhs, rhs) \
450 (strcmp(DTRACE_GETSTR(hash, lhs), \
451 DTRACE_GETSTR(hash, rhs)) == 0)
452
453 #define DTRACE_AGGHASHSIZE_SLEW 17
454
455 #define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)
456
457 /*
458 * The key for a thread-local variable consists of the lower 61 bits of the
459 * current_thread(), plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
460 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
461 * equal to a variable identifier. This is necessary (but not sufficient) to
462 * assure that global associative arrays never collide with thread-local
463 * variables. To guarantee that they cannot collide, we must also define the
464 * order for keying dynamic variables. That order is:
465 *
466 * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
467 *
468 * Because the variable-key and the tls-key are in orthogonal spaces, there is
469 * no way for a global variable key signature to match a thread-local key
470 * signature.
471 */
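/*
 * Illustratively, the key produced by the DTRACE_TLS_THRKEY() variants below
 * is laid out as follows (an informal reading of the shifts and masks, not a
 * formal interface):
 *
 *	bits 63..61: interrupt state (at most 3 bits, asserted below)
 *	bits 60..0:  thread key material -- the current_thread() pointer (plus
 *	             the PID on 32-bit arm), offset by DIF_VARIABLE_MAX and
 *	             taken mod 2^61
 */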
472 #if defined (__x86_64__)
473 /* FIXME: two function calls!! */
474 #define DTRACE_TLS_THRKEY(where) { \
475 uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
476 uint64_t thr = (uintptr_t)current_thread(); \
477 ASSERT(intr < (1 << 3)); \
478 (where) = ((thr + DIF_VARIABLE_MAX) & \
479 (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
480 }
481 #elif defined(__arm__)
482 /* FIXME: three function calls!!! */
483 #define DTRACE_TLS_THRKEY(where) { \
484 uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
485 uint64_t thr = (uintptr_t)current_thread(); \
486 uint_t pid = (uint_t)dtrace_proc_selfpid(); \
487 ASSERT(intr < (1 << 3)); \
488 (where) = (((thr << 32 | pid) + DIF_VARIABLE_MAX) & \
489 (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
490 }
491 #elif defined (__arm64__)
492 /* FIXME: two function calls!! */
493 #define DTRACE_TLS_THRKEY(where) { \
494 uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
495 uint64_t thr = (uintptr_t)current_thread(); \
496 ASSERT(intr < (1 << 3)); \
497 (where) = ((thr + DIF_VARIABLE_MAX) & \
498 (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
499 }
500 #else
501 #error Unknown architecture
502 #endif
503
504 #define DT_BSWAP_8(x) ((x) & 0xff)
505 #define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
506 #define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
507 #define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
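/*
 * For example, DT_BSWAP_32(0x11223344) evaluates to 0x44332211; operands are
 * expected to be unsigned values of the appropriate width.
 */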
508
509 #define DT_MASK_LO 0x00000000FFFFFFFFULL
510
511 #define DTRACE_STORE(type, tomax, offset, what) \
512 *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
513
514
515 #define DTRACE_ALIGNCHECK(addr, size, flags) \
516 if (addr & (MIN(size,4) - 1)) { \
517 *flags |= CPU_DTRACE_BADALIGN; \
518 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
519 return (0); \
520 }
521
522 #define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz) \
523 do { \
524 if ((remp) != NULL) { \
525 *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \
526 } \
527 } while (0)
528
529
530 /*
531 * Test whether a range of memory starting at testaddr of size testsz falls
532 * within the range of memory described by addr, sz. We take care to avoid
533 * problems with overflow and underflow of the unsigned quantities, and
534 * disallow all negative sizes. Ranges of size 0 are allowed.
535 */
536 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
537 ((testaddr) - (baseaddr) < (basesz) && \
538 (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
539 (testaddr) + (testsz) >= (testaddr))
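/*
 * For instance, with baseaddr = 0x1000, basesz = 0x100 and testaddr = 0xff0,
 * the unsigned difference (testaddr - baseaddr) wraps to a huge value and
 * fails the first comparison, so an underflowing test address cannot pass.
 * The final clause likewise rejects overflow of testaddr + testsz.
 */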
540
541 /*
542 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
543 * alloc_sz on the righthand side of the comparison in order to avoid overflow
544 * or underflow in the comparison with it. This is simpler than the INRANGE
545 * check above, because we know that the dtms_scratch_ptr is valid in the
546 * range. Allocations of size zero are allowed.
547 */
548 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
549 ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
550 (mstate)->dtms_scratch_ptr >= (alloc_sz))
551
552 #define RECOVER_LABEL(bits) dtraceLoadRecover##bits:
553
554 #if defined (__x86_64__) || (defined (__arm__) || defined (__arm64__))
555 #define DTRACE_LOADFUNC(bits) \
556 /*CSTYLED*/ \
557 uint##bits##_t dtrace_load##bits(uintptr_t addr); \
558 \
559 uint##bits##_t \
560 dtrace_load##bits(uintptr_t addr) \
561 { \
562 size_t size = bits / NBBY; \
563 /*CSTYLED*/ \
564 uint##bits##_t rval = 0; \
565 int i; \
566 volatile uint16_t *flags = (volatile uint16_t *) \
567 &cpu_core[CPU->cpu_id].cpuc_dtrace_flags; \
568 \
569 DTRACE_ALIGNCHECK(addr, size, flags); \
570 \
571 for (i = 0; i < dtrace_toxranges; i++) { \
572 if (addr >= dtrace_toxrange[i].dtt_limit) \
573 continue; \
574 \
575 if (addr + size <= dtrace_toxrange[i].dtt_base) \
576 continue; \
577 \
578 /* \
579 * This address falls within a toxic region; return 0. \
580 */ \
581 *flags |= CPU_DTRACE_BADADDR; \
582 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
583 return (0); \
584 } \
585 \
586 { \
587 volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits; \
588 *flags |= CPU_DTRACE_NOFAULT; \
589 recover = dtrace_sign_and_set_thread_recover(current_thread(), recover); \
590 /*CSTYLED*/ \
591 /* \
592 * PR6394061 - avoid device memory that is unpredictably \
593 * mapped and unmapped \
594 */ \
595 if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr))) \
596 rval = *((volatile uint##bits##_t *)addr); \
597 else { \
598 *flags |= CPU_DTRACE_BADADDR; \
599 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
600 return (0); \
601 } \
602 \
603 RECOVER_LABEL(bits); \
604 (void)dtrace_set_thread_recover(current_thread(), recover); \
605 *flags &= ~CPU_DTRACE_NOFAULT; \
606 } \
607 \
608 return (rval); \
609 }
610 #else /* all other architectures */
611 #error Unknown Architecture
612 #endif
613
614 #ifdef __LP64__
615 #define dtrace_loadptr dtrace_load64
616 #else
617 #define dtrace_loadptr dtrace_load32
618 #endif
619
620 #define DTRACE_DYNHASH_FREE 0
621 #define DTRACE_DYNHASH_SINK 1
622 #define DTRACE_DYNHASH_VALID 2
623
624 #define DTRACE_MATCH_FAIL -1
625 #define DTRACE_MATCH_NEXT 0
626 #define DTRACE_MATCH_DONE 1
627 #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
628 #define DTRACE_STATE_ALIGN 64
629
630 #define DTRACE_FLAGS2FLT(flags) \
631 (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
632 ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
633 ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
634 ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
635 ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
636 ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
637 ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
638 ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
639 ((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
640 DTRACEFLT_UNKNOWN)
641
642 #define DTRACEACT_ISSTRING(act) \
643 ((act)->dta_kind == DTRACEACT_DIFEXPR && \
644 (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
645
646
647 static size_t dtrace_strlen(const char *, size_t);
648 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
649 static void dtrace_enabling_provide(dtrace_provider_t *);
650 static int dtrace_enabling_match(dtrace_enabling_t *, int *, dtrace_match_cond_t *cond);
651 static void dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond);
652 static void dtrace_enabling_matchall(void);
653 static dtrace_state_t *dtrace_anon_grab(void);
654 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
655 dtrace_state_t *, uint64_t, uint64_t);
656 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
657 static void dtrace_buffer_drop(dtrace_buffer_t *);
658 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
659 dtrace_state_t *, dtrace_mstate_t *);
660 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
661 dtrace_optval_t);
662 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *, void *);
663 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
664 static int dtrace_canload_remains(uint64_t, size_t, size_t *,
665 dtrace_mstate_t *, dtrace_vstate_t *);
666 static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
667 dtrace_mstate_t *, dtrace_vstate_t *);
668
669
670 /*
671 * DTrace sysctl handlers
672 *
673 * These declarations and functions are used for deeper DTrace configuration.
674 * Most of them are not set on a per-consumer basis and may impact other
675 * DTrace consumers. Correctness may not be guaranteed for all of these
676 * variables, so be careful about the values you use.
677 */
678
679 SYSCTL_DECL(_kern_dtrace);
680 SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "dtrace");
681
682 static int
683 sysctl_dtrace_err_verbose SYSCTL_HANDLER_ARGS
684 {
685 #pragma unused(oidp, arg2)
686 int changed, error;
687 int value = *(int *) arg1;
688
689 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
690 if (error || !changed)
691 return (error);
692
693 if (value != 0 && value != 1)
694 return (ERANGE);
695
696 lck_mtx_lock(&dtrace_lock);
697 dtrace_err_verbose = value;
698 lck_mtx_unlock(&dtrace_lock);
699
700 return (0);
701 }
702
703 /*
704 * kern.dtrace.err_verbose
705 *
706 * Set DTrace verbosity when an error occurs (0 = disabled, 1 = enabled).
707 * Errors are reported when a DIFO or a DOF has been rejected by the kernel.
708 */
709 SYSCTL_PROC(_kern_dtrace, OID_AUTO, err_verbose,
710 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
711 &dtrace_err_verbose, 0,
712 sysctl_dtrace_err_verbose, "I", "dtrace error verbose");
713
714 static int
715 sysctl_dtrace_buffer_memory_maxsize SYSCTL_HANDLER_ARGS
716 {
717 #pragma unused(oidp, arg2, req)
718 int changed, error;
719 uint64_t value = *(uint64_t *) arg1;
720
721 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
722 if (error || !changed)
723 return (error);
724
725 if (value <= dtrace_buffer_memory_inuse)
726 return (ERANGE);
727
728 lck_mtx_lock(&dtrace_lock);
729 dtrace_buffer_memory_maxsize = value;
730 lck_mtx_unlock(&dtrace_lock);
731
732 return (0);
733 }
734
735 /*
736 * kern.dtrace.buffer_memory_maxsize
737 *
738 * Set the maximum size, in bytes, used by all the consumers' state buffers. By
739 * default the limit is PHYS_MEM / 3 for *all* consumers. Attempting to set a
740 * zero or negative value, or a value <= dtrace_buffer_memory_inuse, will fail.
741 */
742 SYSCTL_PROC(_kern_dtrace, OID_AUTO, buffer_memory_maxsize,
743 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
744 &dtrace_buffer_memory_maxsize, 0,
745 sysctl_dtrace_buffer_memory_maxsize, "Q", "dtrace state buffer memory maxsize");
746
747 /*
748 * kern.dtrace.buffer_memory_inuse
749 *
750 * Current state buffer memory used, in bytes, by all the DTrace consumers.
751 * This value is read-only.
752 */
753 SYSCTL_QUAD(_kern_dtrace, OID_AUTO, buffer_memory_inuse, CTLFLAG_RD | CTLFLAG_LOCKED,
754 &dtrace_buffer_memory_inuse, "dtrace state buffer memory in-use");
755
756 static int
757 sysctl_dtrace_difo_maxsize SYSCTL_HANDLER_ARGS
758 {
759 #pragma unused(oidp, arg2, req)
760 int changed, error;
761 size_t value = *(size_t*) arg1;
762
763 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
764 if (error || !changed)
765 return (error);
766
767 if (value <= 0)
768 return (ERANGE);
769
770 lck_mtx_lock(&dtrace_lock);
771 dtrace_difo_maxsize = value;
772 lck_mtx_unlock(&dtrace_lock);
773
774 return (0);
775 }
776
777 /*
778 * kern.dtrace.difo_maxsize
779 *
780 * Set the DIFO max size in bytes; see the definition of dtrace_difo_maxsize
781 * for the default value. Attempting to set a zero or negative size will
782 * result in a failure.
783 */
784 SYSCTL_PROC(_kern_dtrace, OID_AUTO, difo_maxsize,
785 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
786 &dtrace_difo_maxsize, 0,
787 sysctl_dtrace_difo_maxsize, "Q", "dtrace difo maxsize");
788
789 static int
790 sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
791 {
792 #pragma unused(oidp, arg2, req)
793 int changed, error;
794 dtrace_optval_t value = *(dtrace_optval_t *) arg1;
795
796 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
797 if (error || !changed)
798 return (error);
799
800 if (value <= 0)
801 return (ERANGE);
802
803 if (value >= dtrace_copy_maxsize())
804 return (ERANGE);
805
806 lck_mtx_lock(&dtrace_lock);
807 dtrace_dof_maxsize = value;
808 lck_mtx_unlock(&dtrace_lock);
809
810 return (0);
811 }
812
813 /*
814 * kern.dtrace.dof_maxsize
815 *
816 * Set the DOF max size in bytes; see the definition of dtrace_dof_maxsize for
817 * the default value. Attempting to set a zero or negative size will result
818 * in a failure.
819 */
820 SYSCTL_PROC(_kern_dtrace, OID_AUTO, dof_maxsize,
821 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
822 &dtrace_dof_maxsize, 0,
823 sysctl_dtrace_dof_maxsize, "Q", "dtrace dof maxsize");
824
825 static int
826 sysctl_dtrace_statvar_maxsize SYSCTL_HANDLER_ARGS
827 {
828 #pragma unused(oidp, arg2, req)
829 int changed, error;
830 dtrace_optval_t value = *(dtrace_optval_t*) arg1;
831
832 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
833 if (error || !changed)
834 return (error);
835
836 if (value <= 0)
837 return (ERANGE);
838 if (value > dtrace_statvar_maxsize_max)
839 return (ERANGE);
840
841 lck_mtx_lock(&dtrace_lock);
842 dtrace_statvar_maxsize = value;
843 lck_mtx_unlock(&dtrace_lock);
844
845 return (0);
846 }
847
848 /*
849 * kern.dtrace.global_maxsize
850 *
851 * Set the static variable max size in bytes; see the definition of
852 * dtrace_statvar_maxsize for the default value. Attempting to set a zero,
853 * negative, or too-large size will result in a failure.
854 */
855 SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
856 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
857 &dtrace_statvar_maxsize, 0,
858 sysctl_dtrace_statvar_maxsize, "Q", "dtrace statvar maxsize");
859
860
861 /*
862 * kern.dtrace.provide_private_probes
863 *
864 * Set whether the providers must provide the private probes. This is
865 * kept for compatibility, as the private probes are now always provided.
866 */
867 SYSCTL_INT(_kern_dtrace, OID_AUTO, provide_private_probes,
868 CTLFLAG_RD | CTLFLAG_LOCKED,
869 (int *)NULL, 1, "provider must provide the private probes");
870
871 /*
872 * kern.dtrace.dof_mode
873 *
874 * Returns the current DOF mode.
875 * This value is read-only.
876 */
877 SYSCTL_INT(_kern_dtrace, OID_AUTO, dof_mode, CTLFLAG_RD | CTLFLAG_LOCKED,
878 &dtrace_dof_mode, 0, "dtrace dof mode");
879
880 /*
881 * DTrace Probe Context Functions
882 *
883 * These functions are called from probe context. Because probe context is
884 * any context in which C may be called, arbitrary locks may be held,
885 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
886 * As a result, functions called from probe context may only call other DTrace
887 * support functions -- they may not interact at all with the system at large.
888 * (Note that the ASSERT macro is made probe-context safe by redefining it in
889 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
890 * loads are to be performed from probe context, they _must_ be in terms of
891 * the safe dtrace_load*() variants.
892 *
893 * Some functions in this block are not actually called from probe context;
894 * for these functions, there will be a comment above the function reading
895 * "Note: not called from probe context."
896 */
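/*
 * For example (illustrative only), a probe-context function must not
 * dereference an arbitrary address directly:
 *
 *	val = *(uint64_t *)addr;	-- unsafe in probe context
 *	val = dtrace_load64(addr);	-- safe: faults and toxic ranges
 *					   are handled for us
 */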
897
898 int
899 dtrace_assfail(const char *a, const char *f, int l)
900 {
901 panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);
902
903 /*
904 * We just need something here that even the most clever compiler
905 * cannot optimize away.
906 */
907 return (a[(uintptr_t)f]);
908 }
909
910 /*
911 * Atomically increment a specified error counter from probe context.
912 */
913 static void
914 dtrace_error(uint32_t *counter)
915 {
916 /*
917 * Most counters stored to in probe context are per-CPU counters.
918 * However, there are some error conditions that are sufficiently
919 * arcane that they don't merit per-CPU storage. If these counters
920 * are incremented concurrently on different CPUs, scalability will be
921 * adversely affected -- but we don't expect them to be white-hot in a
922 * correctly constructed enabling...
923 */
924 uint32_t oval, nval;
925
926 do {
927 oval = *counter;
928
929 if ((nval = oval + 1) == 0) {
930 /*
931 * If the counter would wrap, set it to 1 -- assuring
932 * that the counter is never zero when we have seen
933 * errors. (The counter must be 32-bits because we
934 * aren't guaranteed a 64-bit compare&swap operation.)
935 * To save this code both the infamy of being fingered
936 * by a priggish news story and the indignity of being
937 * the target of a neo-puritan witch trial, we're
938 * carefully avoiding any colorful description of the
939 * likelihood of this condition -- but suffice it to
940 * say that it is only slightly more likely than the
941 * overflow of predicate cache IDs, as discussed in
942 * dtrace_predicate_create().
943 */
944 nval = 1;
945 }
946 } while (dtrace_cas32(counter, oval, nval) != oval);
947 }
948
949 /*
950 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
951 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
952 */
953 DTRACE_LOADFUNC(8)
954 DTRACE_LOADFUNC(16)
955 DTRACE_LOADFUNC(32)
956 DTRACE_LOADFUNC(64)
957
958 static int
959 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
960 {
961 if (dest < mstate->dtms_scratch_base)
962 return (0);
963
964 if (dest + size < dest)
965 return (0);
966
967 if (dest + size > mstate->dtms_scratch_ptr)
968 return (0);
969
970 return (1);
971 }
972
973 static int
974 dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
975 dtrace_statvar_t **svars, int nsvars)
976 {
977 int i;
978
979 size_t maxglobalsize, maxlocalsize;
980
981 maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
982 maxlocalsize = (maxglobalsize) * NCPU;
983
984 if (nsvars == 0)
985 return (0);
986
987 for (i = 0; i < nsvars; i++) {
988 dtrace_statvar_t *svar = svars[i];
989 uint8_t scope;
990 size_t size;
991
992 if (svar == NULL || (size = svar->dtsv_size) == 0)
993 continue;
994
995 scope = svar->dtsv_var.dtdv_scope;
996
997 /*
998 * We verify that our size is valid in the spirit of providing
999 * defense in depth: we want to prevent attackers from using
1000 * DTrace to escalate an orthogonal kernel heap corruption bug
1001 * into the ability to store to arbitrary locations in memory.
1002 */
1003 VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
1004 (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
1005
1006 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) {
1007 DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
1008 svar->dtsv_size);
1009 return (1);
1010 }
1011 }
1012
1013 return (0);
1014 }
1015
1016 /*
1017 * Check to see if the address is within a memory region to which a store may
1018 * be issued. This includes the DTrace scratch areas, and any DTrace variable
1019 * region. The caller of dtrace_canstore() is responsible for performing any
1020 * alignment checks that are needed before stores are actually executed.
1021 */
1022 static int
1023 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
1024 dtrace_vstate_t *vstate)
1025 {
1026 return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
1027 }
1028 /*
1029 * Implementation of dtrace_canstore which communicates the upper bound of the
1030 * allowed memory region.
1031 */
1032 static int
1033 dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
1034 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1035 {
1036 /*
1037 * First, check to see if the address is in scratch space...
1038 */
1039 if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
1040 mstate->dtms_scratch_size)) {
1041 DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
1042 mstate->dtms_scratch_size);
1043 return (1);
1044 }
1045 /*
1046 * Now check to see if it's a dynamic variable. This check will pick
1047 * up both thread-local variables and any global dynamically-allocated
1048 * variables.
1049 */
1050 if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
1051 vstate->dtvs_dynvars.dtds_size)) {
1052 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
1053 uintptr_t base = (uintptr_t)dstate->dtds_base +
1054 (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
1055 uintptr_t chunkoffs;
1056 dtrace_dynvar_t *dvar;
1057
1058 /*
1059 * Before we assume that we can store here, we need to make
1060 * sure that it isn't in our metadata -- storing to our
1061 * dynamic variable metadata would corrupt our state. For
1062 * the range to not include any dynamic variable metadata,
1063 * it must:
1064 *
1065 * (1) Start above the hash table that is at the base of
1066 * the dynamic variable space
1067 *
1068 * (2) Have a starting chunk offset that is beyond the
1069 * dtrace_dynvar_t that is at the base of every chunk
1070 *
1071 * (3) Not span a chunk boundary
1072 *
1073 * (4) Not be in the tuple space of a dynamic variable
1074 *
1075 */
1076 if (addr < base)
1077 return (0);
1078
1079 chunkoffs = (addr - base) % dstate->dtds_chunksize;
1080
1081 if (chunkoffs < sizeof (dtrace_dynvar_t))
1082 return (0);
1083
1084 if (chunkoffs + sz > dstate->dtds_chunksize)
1085 return (0);
1086
1087 dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
1088
1089 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
1090 return (0);
1091
1092 if (chunkoffs < sizeof (dtrace_dynvar_t) +
1093 ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
1094 return (0);
1095
1096 return (1);
1097 }
1098
1099 /*
1100 * Finally, check the static local and global variables. These checks
1101 * take the longest, so we perform them last.
1102 */
1103 if (dtrace_canstore_statvar(addr, sz, remain,
1104 vstate->dtvs_locals, vstate->dtvs_nlocals))
1105 return (1);
1106
1107 if (dtrace_canstore_statvar(addr, sz, remain,
1108 vstate->dtvs_globals, vstate->dtvs_nglobals))
1109 return (1);
1110
1111 return (0);
1112 }
1113
1114
1115 /*
1116 * Convenience routine to check to see if the address is within a memory
1117 * region in which a load may be issued given the user's privilege level;
1118 * if not, it sets the appropriate error flags and loads 'addr' into the
1119 * illegal value slot.
1120 *
1121 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
1122 * appropriate memory access protection.
1123 */
1124 int
1125 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
1126 dtrace_vstate_t *vstate)
1127 {
1128 return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
1129 }
1130
1131 /*
1132 * Implementation of dtrace_canload which communicates the upper bound of the
1133 * allowed memory region.
1134 */
1135 static int
1136 dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
1137 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1138 {
1139 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
1140
1141 /*
1142 * If we hold the privilege to read from kernel memory, then
1143 * everything is readable.
1144 */
1145 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1146 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1147 return (1);
1148 }
1149
1150 /*
1151 * You can obviously read that which you can store.
1152 */
1153 if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
1154 return (1);
1155
1156 /*
1157 * We're allowed to read from our own string table.
1158 */
1159 if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
1160 mstate->dtms_difo->dtdo_strlen)) {
1161 DTRACE_RANGE_REMAIN(remain, addr,
1162 mstate->dtms_difo->dtdo_strtab,
1163 mstate->dtms_difo->dtdo_strlen);
1164 return (1);
1165 }
1166
1167 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
1168 *illval = addr;
1169 return (0);
1170 }
1171
1172 /*
1173 * Convenience routine to check to see if a given string is within a memory
1174 * region in which a load may be issued given the user's privilege level;
1175 * this exists so that we don't need to issue unnecessary dtrace_strlen()
1176 * calls in the event that the user has all privileges.
1177 */
1178 static int
1179 dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
1180 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1181 {
1182 size_t rsize;
1183
1184 /*
1185 * If we hold the privilege to read from kernel memory, then
1186 * everything is readable.
1187 */
1188 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1189 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1190 return (1);
1191 }
1192
1193 /*
1194 * Even if the caller is uninterested in querying the remaining valid
1195 * range, it must still be computed here to ensure the access is allowed.
1196 */
1197 if (remain == NULL) {
1198 remain = &rsize;
1199 }
1200 if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
1201 size_t strsz;
1202 /*
1203 * Perform the strlen after determining the length of the
1204 * memory region which is accessible. This prevents timing
1205 * information from being used to find NULs in memory which is
1206 * not accessible to the caller.
1207 */
1208 strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
1209 MIN(sz, *remain));
1210 if (strsz <= *remain) {
1211 return (1);
1212 }
1213 }
1214
1215 return (0);
1216 }
1217
1218 /*
1219 * Convenience routine to check to see if a given variable is within a memory
1220 * region in which a load may be issued given the user's privilege level.
1221 */
1222 static int
1223 dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
1224 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1225 {
1226 size_t sz;
1227 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1228
1229 /*
1230 * Calculate the max size before performing any checks since even
1231 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
1232 * return the max length via 'remain'.
1233 */
1234 if (type->dtdt_kind == DIF_TYPE_STRING) {
1235 dtrace_state_t *state = vstate->dtvs_state;
1236
1237 if (state != NULL) {
1238 sz = state->dts_options[DTRACEOPT_STRSIZE];
1239 } else {
1240 /*
1241 * In helper context, we have a NULL state; fall back
1242 * to using the system-wide default for the string size
1243 * in this case.
1244 */
1245 sz = dtrace_strsize_default;
1246 }
1247 } else {
1248 sz = type->dtdt_size;
1249 }
1250
1251 /*
1252 * If we hold the privilege to read from kernel memory, then
1253 * everything is readable.
1254 */
1255 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1256 DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
1257 return (1);
1258 }
1259
1260 if (type->dtdt_kind == DIF_TYPE_STRING) {
1261 return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
1262 vstate));
1263 }
1264 return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
1265 vstate));
1266 }
1267
1268 #define isdigit(ch) ((ch) >= '0' && (ch) <= '9')
1269 #define islower(ch) ((ch) >= 'a' && (ch) <= 'z')
1270 #define isspace(ch) (((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \
1271 ((ch) == '\t') || ((ch) == '\f'))
1272 #define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
1273 ((ch) >= 'A' && (ch) <= 'F'))
1274 #define lisalnum(x) \
1275 (isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z'))
1276
1277 #define DIGIT(x) \
1278 (isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A')
1279
1280 /*
1281 * Convert a string to a signed integer using safe loads.
1282 */
1283 static int64_t
1284 dtrace_strtoll(char *input, int base, size_t limit)
1285 {
1286 uintptr_t pos = (uintptr_t)input;
1287 int64_t val = 0;
1288 int x;
1289 boolean_t neg = B_FALSE;
1290 char c, cc, ccc;
1291 uintptr_t end = pos + limit;
1292
1293 /*
1294 * Consume any whitespace preceding digits.
1295 */
1296 while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
1297 pos++;
1298
1299 /*
1300 * Handle an explicit sign if one is present.
1301 */
1302 if (c == '-' || c == '+') {
1303 if (c == '-')
1304 neg = B_TRUE;
1305 c = dtrace_load8(++pos);
1306 }
1307
1308 /*
1309 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
1310 * if present.
1311 */
1312 if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
1313 cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
1314 pos += 2;
1315 c = ccc;
1316 }
1317
1318 /*
1319 * Read in contiguous digits until the first non-digit character.
1320 */
1321 for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
1322 c = dtrace_load8(++pos))
1323 val = val * base + x;
1324
1325 return (neg ? -val : val);
1326 }
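/*
 * For example (illustrative): dtrace_strtoll("  -0x1f", 16, 8) consumes the
 * leading whitespace, the sign, and the "0x" prefix, and returns -31;
 * parsing stops at the first character that is not a digit in 'base'.
 */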
1327
1328
1329 /*
1330 * Compare two strings using safe loads.
1331 */
1332 static int
1333 dtrace_strncmp(const char *s1, const char *s2, size_t limit)
1334 {
1335 uint8_t c1, c2;
1336 volatile uint16_t *flags;
1337
1338 if (s1 == s2 || limit == 0)
1339 return (0);
1340
1341 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1342
1343 do {
1344 if (s1 == NULL) {
1345 c1 = '\0';
1346 } else {
1347 c1 = dtrace_load8((uintptr_t)s1++);
1348 }
1349
1350 if (s2 == NULL) {
1351 c2 = '\0';
1352 } else {
1353 c2 = dtrace_load8((uintptr_t)s2++);
1354 }
1355
1356 if (c1 != c2)
1357 return (c1 - c2);
1358 } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1359
1360 return (0);
1361 }
1362
1363 /*
1364 * Compute strlen(s) for a string using safe memory accesses. The additional
1365 * lim parameter is used to specify a maximum length to ensure completion.
1366 */
1367 static size_t
1368 dtrace_strlen(const char *s, size_t lim)
1369 {
1370 uint_t len;
1371
1372 for (len = 0; len != lim; len++) {
1373 if (dtrace_load8((uintptr_t)s++) == '\0')
1374 break;
1375 }
1376
1377 return (len);
1378 }
1379
1380 /*
1381 * Check if an address falls within a toxic region.
1382 */
1383 static int
1384 dtrace_istoxic(uintptr_t kaddr, size_t size)
1385 {
1386 uintptr_t taddr, tsize;
1387 int i;
1388
1389 for (i = 0; i < dtrace_toxranges; i++) {
1390 taddr = dtrace_toxrange[i].dtt_base;
1391 tsize = dtrace_toxrange[i].dtt_limit - taddr;
1392
1393 if (kaddr - taddr < tsize) {
1394 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1395 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1396 return (1);
1397 }
1398
1399 if (taddr - kaddr < size) {
1400 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1401 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1402 return (1);
1403 }
1404 }
1405
1406 return (0);
1407 }
1408
1409 /*
1410 * Copy src to dst using safe memory accesses. The src is assumed to be unsafe
1411 * memory specified by the DIF program. The dst is assumed to be safe memory
1412 * that we can store to directly because it is managed by DTrace. As with
1413 * standard bcopy, overlapping copies are handled properly.
1414 */
1415 static void
1416 dtrace_bcopy(const void *src, void *dst, size_t len)
1417 {
1418 if (len != 0) {
1419 uint8_t *s1 = dst;
1420 const uint8_t *s2 = src;
1421
1422 if (s1 <= s2) {
1423 do {
1424 *s1++ = dtrace_load8((uintptr_t)s2++);
1425 } while (--len != 0);
1426 } else {
1427 s2 += len;
1428 s1 += len;
1429
1430 do {
1431 *--s1 = dtrace_load8((uintptr_t)--s2);
1432 } while (--len != 0);
1433 }
1434 }
1435 }
1436
1437 /*
1438 * Copy src to dst using safe memory accesses, up to either the specified
1439 * length, or the point that a nul byte is encountered. The src is assumed to
1440 * be unsafe memory specified by the DIF program. The dst is assumed to be
1441 * safe memory that we can store to directly because it is managed by DTrace.
1442 * Unlike dtrace_bcopy(), overlapping regions are not handled.
1443 */
1444 static void
1445 dtrace_strcpy(const void *src, void *dst, size_t len)
1446 {
1447 if (len != 0) {
1448 uint8_t *s1 = dst, c;
1449 const uint8_t *s2 = src;
1450
1451 do {
1452 *s1++ = c = dtrace_load8((uintptr_t)s2++);
1453 } while (--len != 0 && c != '\0');
1454 }
1455 }
1456
1457 /*
1458 * Copy src to dst, deriving the size and type from the specified (BYREF)
1459 * variable type. The src is assumed to be unsafe memory specified by the DIF
1460 * program. The dst is assumed to be DTrace variable memory that is of the
1461 * specified type; we assume that we can store to directly.
1462 */
1463 static void
1464 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
1465 {
1466 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1467
1468 if (type->dtdt_kind == DIF_TYPE_STRING) {
1469 dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
1470 } else {
1471 dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
1472 }
1473 }
1474
1475 /*
1476 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
1477 * unsafe memory specified by the DIF program. The s2 data is assumed to be
1478 * safe memory that we can access directly because it is managed by DTrace.
1479 */
1480 static int
1481 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1482 {
1483 volatile uint16_t *flags;
1484
1485 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1486
1487 if (s1 == s2)
1488 return (0);
1489
1490 if (s1 == NULL || s2 == NULL)
1491 return (1);
1492
1493 if (s1 != s2 && len != 0) {
1494 const uint8_t *ps1 = s1;
1495 const uint8_t *ps2 = s2;
1496
1497 do {
1498 if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1499 return (1);
1500 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1501 }
1502 return (0);
1503 }
1504
1505 /*
1506 * Zero the specified region using a simple byte-by-byte loop. Note that this
1507 * is for safe DTrace-managed memory only.
1508 */
1509 static void
1510 dtrace_bzero(void *dst, size_t len)
1511 {
1512 uchar_t *cp;
1513
1514 for (cp = dst; len != 0; len--)
1515 *cp++ = 0;
1516 }
1517
1518 static void
1519 dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
1520 {
1521 uint64_t result[2];
1522
1523 result[0] = addend1[0] + addend2[0];
1524 result[1] = addend1[1] + addend2[1] +
1525 (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
1526
1527 sum[0] = result[0];
1528 sum[1] = result[1];
1529 }
1530
1531 /*
1532 * Shift the 128-bit value in a by b. If b is positive, shift left.
1533 * If b is negative, shift right.
1534 */
1535 static void
1536 dtrace_shift_128(uint64_t *a, int b)
1537 {
1538 uint64_t mask;
1539
1540 if (b == 0)
1541 return;
1542
1543 if (b < 0) {
1544 b = -b;
1545 if (b >= 64) {
1546 a[0] = a[1] >> (b - 64);
1547 a[1] = 0;
1548 } else {
1549 a[0] >>= b;
1550 mask = 1LL << (64 - b);
1551 mask -= 1;
1552 a[0] |= ((a[1] & mask) << (64 - b));
1553 a[1] >>= b;
1554 }
1555 } else {
1556 if (b >= 64) {
1557 a[1] = a[0] << (b - 64);
1558 a[0] = 0;
1559 } else {
1560 a[1] <<= b;
1561 mask = a[0] >> (64 - b);
1562 a[1] |= mask;
1563 a[0] <<= b;
1564 }
1565 }
1566 }
1567
1568 /*
1569 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1570 * use native multiplication on those, and then re-combine into the
1571 * resulting 128-bit value.
1572 *
1573 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1574 * hi1 * hi2 << 64 +
1575 * hi1 * lo2 << 32 +
1576 * hi2 * lo1 << 32 +
1577 * lo1 * lo2
1578 */
1579 static void
1580 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1581 {
1582 uint64_t hi1, hi2, lo1, lo2;
1583 uint64_t tmp[2];
1584
1585 hi1 = factor1 >> 32;
1586 hi2 = factor2 >> 32;
1587
1588 lo1 = factor1 & DT_MASK_LO;
1589 lo2 = factor2 & DT_MASK_LO;
1590
1591 product[0] = lo1 * lo2;
1592 product[1] = hi1 * hi2;
1593
1594 tmp[0] = hi1 * lo2;
1595 tmp[1] = 0;
1596 dtrace_shift_128(tmp, 32);
1597 dtrace_add_128(product, tmp, product);
1598
1599 tmp[0] = hi2 * lo1;
1600 tmp[1] = 0;
1601 dtrace_shift_128(tmp, 32);
1602 dtrace_add_128(product, tmp, product);
1603 }
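/*
 * Worked example (illustrative): with factor1 = factor2 = 2^32 + 1, we have
 * lo1 = lo2 = hi1 = hi2 = 1, so product starts as { 1, 1 } and each shifted
 * cross term contributes 2^32 to the low word, yielding { 2^33 + 1, 1 },
 * i.e. 2^64 + 2^33 + 1 == (2^32 + 1)^2.
 */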
1604
1605 /*
1606 * This privilege check should be used by actions and subroutines to
1607 * verify that the user credentials of the process that enabled the
1608 * invoking ECB match the target credentials.
1609 */
1610 static int
1611 dtrace_priv_proc_common_user(dtrace_state_t *state)
1612 {
1613 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1614
1615 /*
1616 * We should always have a non-NULL state cred here, since if cred
1617 * is null (anonymous tracing), we fast-path bypass this routine.
1618 */
1619 ASSERT(s_cr != NULL);
1620
1621 if ((cr = dtrace_CRED()) != NULL &&
1622 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
1623 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
1624 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
1625 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
1626 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
1627 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
1628 return (1);
1629
1630 return (0);
1631 }
1632
1633 /*
1634 * This privilege check should be used by actions and subroutines to
1635 * verify that the zone of the process that enabled the invoking ECB
1636 * matches the target credentials.
1637 */
1638 static int
1639 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1640 {
1641 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1642 #pragma unused(cr, s_cr, state) /* __APPLE__ */
1643
1644 /*
1645 * We should always have a non-NULL state cred here, since if cred
1646 * is null (anonymous tracing), we fast-path bypass this routine.
1647 */
1648 ASSERT(s_cr != NULL);
1649
1650 return 1; /* APPLE NOTE: Darwin doesn't do zones. */
1651 }
1652
1653 /*
1654 * This privilege check should be used by actions and subroutines to
1655 * verify that the process has not setuid or changed credentials.
1656 */
1657 static int
1658 dtrace_priv_proc_common_nocd(void)
1659 {
1660 return 1; /* Darwin omits "No Core Dump" flag. */
1661 }
1662
1663 static int
1664 dtrace_priv_proc_destructive(dtrace_state_t *state)
1665 {
1666 int action = state->dts_cred.dcr_action;
1667
1668 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1669 goto bad;
1670
1671 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1672 goto bad;
1673
1674 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1675 dtrace_priv_proc_common_zone(state) == 0)
1676 goto bad;
1677
1678 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1679 dtrace_priv_proc_common_user(state) == 0)
1680 goto bad;
1681
1682 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1683 dtrace_priv_proc_common_nocd() == 0)
1684 goto bad;
1685
1686 return (1);
1687
1688 bad:
1689 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1690
1691 return (0);
1692 }
1693
1694 static int
1695 dtrace_priv_proc_control(dtrace_state_t *state)
1696 {
1697 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1698 goto bad;
1699
1700 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1701 goto bad;
1702
1703 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1704 return (1);
1705
1706 if (dtrace_priv_proc_common_zone(state) &&
1707 dtrace_priv_proc_common_user(state) &&
1708 dtrace_priv_proc_common_nocd())
1709 return (1);
1710
1711 bad:
1712 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1713
1714 return (0);
1715 }
1716
1717 static int
1718 dtrace_priv_proc(dtrace_state_t *state)
1719 {
1720 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1721 goto bad;
1722
1723 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed() && !dtrace_can_attach_to_proc(current_proc()))
1724 goto bad;
1725
1726 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1727 return (1);
1728
1729 bad:
1730 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1731
1732 return (0);
1733 }
1734
1735 /*
1736 * The P_LNOATTACH check is an Apple-specific check.
1737 * We need a version of dtrace_priv_proc() that omits
1738 * that check for PID and EXECNAME accesses.
1739 */
1740 static int
1741 dtrace_priv_proc_relaxed(dtrace_state_t *state)
1742 {
1743
1744 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1745 return (1);
1746
1747 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1748
1749 return (0);
1750 }
1751
1752 static int
1753 dtrace_priv_kernel(dtrace_state_t *state)
1754 {
1755 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed())
1756 goto bad;
1757
1758 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1759 return (1);
1760
1761 bad:
1762 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1763
1764 return (0);
1765 }
1766
1767 static int
1768 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1769 {
1770 if (dtrace_is_restricted())
1771 goto bad;
1772
1773 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1774 return (1);
1775
1776 bad:
1777 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1778
1779 return (0);
1780 }
1781
1782 /*
1783 * Note: not called from probe context. This function is called
1784 * asynchronously (and at a regular interval) from outside of probe context to
1785 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
1786 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1787 */
1788 static void
1789 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1790 {
1791 dtrace_dynvar_t *dirty;
1792 dtrace_dstate_percpu_t *dcpu;
1793 int i, work = 0;
1794
1795 for (i = 0; i < (int)NCPU; i++) {
1796 dcpu = &dstate->dtds_percpu[i];
1797
1798 ASSERT(dcpu->dtdsc_rinsing == NULL);
1799
1800 /*
1801 * If the dirty list is NULL, there is no dirty work to do.
1802 */
1803 if (dcpu->dtdsc_dirty == NULL)
1804 continue;
1805
1806 /*
1807 * If the clean list is non-NULL, then we're not going to do
1808 * any work for this CPU -- it means that there has not been
1809 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1810 * since the last time we cleaned house.
1811 */
1812 if (dcpu->dtdsc_clean != NULL)
1813 continue;
1814
1815 work = 1;
1816
1817 /*
1818 * Atomically move the dirty list aside.
1819 */
1820 do {
1821 dirty = dcpu->dtdsc_dirty;
1822
1823 /*
1824 * Before we zap the dirty list, set the rinsing list.
1825 * (This allows for a potential assertion in
1826 * dtrace_dynvar(): if a free dynamic variable appears
1827 * on a hash chain, either the dirty list or the
1828 * rinsing list for some CPU must be non-NULL.)
1829 */
1830 dcpu->dtdsc_rinsing = dirty;
1831 dtrace_membar_producer();
1832 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1833 dirty, NULL) != dirty);
1834 }
1835
1836 if (!work) {
1837 /*
1838 * We have no work to do; we can simply return.
1839 */
1840 return;
1841 }
1842
1843 dtrace_sync();
1844
1845 for (i = 0; i < (int)NCPU; i++) {
1846 dcpu = &dstate->dtds_percpu[i];
1847
1848 if (dcpu->dtdsc_rinsing == NULL)
1849 continue;
1850
1851 /*
1852 * We are now guaranteed that no hash chain contains a pointer
1853 * into this dirty list; we can make it clean.
1854 */
1855 ASSERT(dcpu->dtdsc_clean == NULL);
1856 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1857 dcpu->dtdsc_rinsing = NULL;
1858 }
1859
1860 /*
1861 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1862 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1863 * This prevents a race whereby a CPU incorrectly decides that
1864 * the state should be something other than DTRACE_DSTATE_CLEAN
1865 * after dtrace_dynvar_clean() has completed.
1866 */
1867 dtrace_sync();
1868
1869 dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1870 }
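/*
 * A sketch of the per-CPU dynamic variable list lifecycle that the
 * routine above advances (see <sys/dtrace_impl.h> for the full
 * discussion):
 *
 *	free --(dtrace_dynvar() allocation)--> live, on a hash chain
 *	live --(deallocation)--> dirty
 *	dirty --(dtrace_dynvar_clean(), first pass)--> rinsing
 *	rinsing --(dtrace_sync(); second pass)--> clean
 *	clean --(allocator CAS)--> free
 *
 * The dtrace_sync() between the two passes guarantees that no probe in
 * flight still holds a pointer into a list being promoted from rinsing
 * to clean.
 */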
1871
1872 /*
1873 * Depending on the value of the op parameter, this function looks-up,
1874 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
1875 * allocation is requested, this function will return a pointer to a
1876 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1877 * variable can be allocated. If NULL is returned, the appropriate counter
1878 * will be incremented.
1879 */
1880 static dtrace_dynvar_t *
1881 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1882 dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1883 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1884 {
1885 uint64_t hashval = DTRACE_DYNHASH_VALID;
1886 dtrace_dynhash_t *hash = dstate->dtds_hash;
1887 dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1888 processorid_t me = CPU->cpu_id, cpu = me;
1889 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1890 size_t bucket, ksize;
1891 size_t chunksize = dstate->dtds_chunksize;
1892 uintptr_t kdata, lock, nstate;
1893 uint_t i;
1894
1895 ASSERT(nkeys != 0);
1896
1897 /*
1898 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
1899 * algorithm. For the by-value portions, we perform the algorithm in
1900 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
1901 * bit, and seems to have only a minute effect on distribution. For
1902 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1903 * over each referenced byte. It's painful to do this, but it's much
1904 * better than pathological hash distribution. The efficacy of the
1905 * hashing algorithm (and a comparison with other algorithms) may be
1906 * found by running the ::dtrace_dynstat MDB dcmd.
1907 */
1908 for (i = 0; i < nkeys; i++) {
1909 if (key[i].dttk_size == 0) {
1910 uint64_t val = key[i].dttk_value;
1911
1912 hashval += (val >> 48) & 0xffff;
1913 hashval += (hashval << 10);
1914 hashval ^= (hashval >> 6);
1915
1916 hashval += (val >> 32) & 0xffff;
1917 hashval += (hashval << 10);
1918 hashval ^= (hashval >> 6);
1919
1920 hashval += (val >> 16) & 0xffff;
1921 hashval += (hashval << 10);
1922 hashval ^= (hashval >> 6);
1923
1924 hashval += val & 0xffff;
1925 hashval += (hashval << 10);
1926 hashval ^= (hashval >> 6);
1927 } else {
1928 /*
1929 * This is incredibly painful, but it beats the hell
1930 * out of the alternative.
1931 */
1932 uint64_t j, size = key[i].dttk_size;
1933 uintptr_t base = (uintptr_t)key[i].dttk_value;
1934
1935 if (!dtrace_canload(base, size, mstate, vstate))
1936 break;
1937
1938 for (j = 0; j < size; j++) {
1939 hashval += dtrace_load8(base + j);
1940 hashval += (hashval << 10);
1941 hashval ^= (hashval >> 6);
1942 }
1943 }
1944 }
1945
1946 if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1947 return (NULL);
1948
1949 hashval += (hashval << 3);
1950 hashval ^= (hashval >> 11);
1951 hashval += (hashval << 15);
1952
1953 /*
1954 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1955 * comes out to be one of our two sentinel hash values. If this
1956 * actually happens, we set the hashval to be a value known to be a
1957 * non-sentinel value.
1958 */
1959 if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1960 hashval = DTRACE_DYNHASH_VALID;
1961
1962 /*
1963 * Yes, it's painful to do a divide here. If the cycle count becomes
1964 * important here, tricks can be pulled to reduce it. (However, it's
1965 * critical that hash collisions be kept to an absolute minimum;
1966 * they're much more painful than a divide.) It's better to have a
1967 * solution that generates few collisions and still keeps things
1968 * relatively simple.
1969 */
1970 bucket = hashval % dstate->dtds_hashsize;
1971
1972 if (op == DTRACE_DYNVAR_DEALLOC) {
1973 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1974
1975 for (;;) {
1976 while ((lock = *lockp) & 1)
1977 continue;
1978
1979 if (dtrace_casptr((void *)(uintptr_t)lockp,
1980 (void *)lock, (void *)(lock + 1)) == (void *)lock)
1981 break;
1982 }
1983
1984 dtrace_membar_producer();
1985 }
1986
1987 top:
1988 prev = NULL;
1989 lock = hash[bucket].dtdh_lock;
1990
1991 dtrace_membar_consumer();
1992
1993 start = hash[bucket].dtdh_chain;
1994 ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1995 start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1996 op != DTRACE_DYNVAR_DEALLOC));
1997
1998 for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1999 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
2000 dtrace_key_t *dkey = &dtuple->dtt_key[0];
2001
2002 if (dvar->dtdv_hashval != hashval) {
2003 if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
2004 /*
2005 * We've reached the sink, and therefore the
2006 * end of the hash chain; we can kick out of
2007 * the loop knowing that we have seen a valid
2008 * snapshot of state.
2009 */
2010 ASSERT(dvar->dtdv_next == NULL);
2011 ASSERT(dvar == &dtrace_dynhash_sink);
2012 break;
2013 }
2014
2015 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
2016 /*
2017 * We've gone off the rails: somewhere along
2018 * the line, one of the members of this hash
2019 * chain was deleted. Note that we could also
2020 * detect this by simply letting this loop run
2021 * to completion, as we would eventually hit
2022 * the end of the dirty list. However, we
2023 * want to avoid running the length of the
2024 * dirty list unnecessarily (it might be quite
2025 * long), so we catch this as early as
2026 * possible by detecting the hash marker. In
2027 * this case, we simply set dvar to NULL and
2028 * break; the conditional after the loop will
2029 * send us back to top.
2030 */
2031 dvar = NULL;
2032 break;
2033 }
2034
2035 goto next;
2036 }
2037
2038 if (dtuple->dtt_nkeys != nkeys)
2039 goto next;
2040
2041 for (i = 0; i < nkeys; i++, dkey++) {
2042 if (dkey->dttk_size != key[i].dttk_size)
2043 goto next; /* size or type mismatch */
2044
2045 if (dkey->dttk_size != 0) {
2046 if (dtrace_bcmp(
2047 (void *)(uintptr_t)key[i].dttk_value,
2048 (void *)(uintptr_t)dkey->dttk_value,
2049 dkey->dttk_size))
2050 goto next;
2051 } else {
2052 if (dkey->dttk_value != key[i].dttk_value)
2053 goto next;
2054 }
2055 }
2056
2057 if (op != DTRACE_DYNVAR_DEALLOC)
2058 return (dvar);
2059
2060 ASSERT(dvar->dtdv_next == NULL ||
2061 dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
2062
2063 if (prev != NULL) {
2064 ASSERT(hash[bucket].dtdh_chain != dvar);
2065 ASSERT(start != dvar);
2066 ASSERT(prev->dtdv_next == dvar);
2067 prev->dtdv_next = dvar->dtdv_next;
2068 } else {
2069 if (dtrace_casptr(&hash[bucket].dtdh_chain,
2070 start, dvar->dtdv_next) != start) {
2071 /*
2072 * We have failed to atomically swing the
2073 * hash table head pointer, presumably because
2074 * of a conflicting allocation on another CPU.
2075 * We need to reread the hash chain and try
2076 * again.
2077 */
2078 goto top;
2079 }
2080 }
2081
2082 dtrace_membar_producer();
2083
2084 /*
2085 * Now set the hash value to indicate that it's free.
2086 */
2087 ASSERT(hash[bucket].dtdh_chain != dvar);
2088 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2089
2090 dtrace_membar_producer();
2091
2092 /*
2093 * Set the next pointer to point at the dirty list, and
2094 * atomically swing the dirty pointer to the newly freed dvar.
2095 */
2096 do {
2097 next = dcpu->dtdsc_dirty;
2098 dvar->dtdv_next = next;
2099 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
2100
2101 /*
2102 * Finally, unlock this hash bucket.
2103 */
2104 ASSERT(hash[bucket].dtdh_lock == lock);
2105 ASSERT(lock & 1);
2106 hash[bucket].dtdh_lock++;
2107
2108 return (NULL);
2109 next:
2110 prev = dvar;
2111 continue;
2112 }
2113
2114 if (dvar == NULL) {
2115 /*
2116 * If dvar is NULL, it is because we went off the rails:
2117 * one of the elements that we traversed in the hash chain
2118 * was deleted while we were traversing it. In this case,
2119 * we assert that we aren't doing a dealloc (deallocs lock
2120 * the hash bucket to prevent themselves from racing with
2121 * one another), and retry the hash chain traversal.
2122 */
2123 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
2124 goto top;
2125 }
2126
2127 if (op != DTRACE_DYNVAR_ALLOC) {
2128 /*
2129 * If we are not to allocate a new variable, we want to
2130 * return NULL now. Before we return, check that the value
2131 * of the lock word hasn't changed. If it has, we may have
2132 * seen an inconsistent snapshot.
2133 */
2134 if (op == DTRACE_DYNVAR_NOALLOC) {
2135 if (hash[bucket].dtdh_lock != lock)
2136 goto top;
2137 } else {
2138 ASSERT(op == DTRACE_DYNVAR_DEALLOC);
2139 ASSERT(hash[bucket].dtdh_lock == lock);
2140 ASSERT(lock & 1);
2141 hash[bucket].dtdh_lock++;
2142 }
2143
2144 return (NULL);
2145 }
2146
2147 /*
2148 * We need to allocate a new dynamic variable. The size we need is the
2149 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
2150 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
2151 * the size of any referred-to data (dsize). We then round the final
2152 * size up to the chunksize for allocation.
2153 */
2154 for (ksize = 0, i = 0; i < nkeys; i++)
2155 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
2156
2157 /*
2158 * This should be pretty much impossible, but could happen if, say,
2159 * strange DIF specified the tuple. Ideally, this should be an
2160 * assertion and not an error condition -- but that requires that the
2161 * chunksize calculation in dtrace_difo_chunksize() be absolutely
2162 * bullet-proof. (That is, it must not be able to be fooled by
2163 * malicious DIF.) Given the lack of backwards branches in DIF,
2164 * solving this would presumably not amount to solving the Halting
2165 * Problem -- but it still seems awfully hard.
2166 */
2167 if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
2168 ksize + dsize > chunksize) {
2169 dcpu->dtdsc_drops++;
2170 return (NULL);
2171 }
2172
2173 nstate = DTRACE_DSTATE_EMPTY;
2174
2175 do {
2176 retry:
2177 free = dcpu->dtdsc_free;
2178
2179 if (free == NULL) {
2180 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2181 void *rval;
2182
2183 if (clean == NULL) {
2184 /*
2185 * We're out of dynamic variable space on
2186 * this CPU. Unless we have tried all CPUs,
2187 * we'll try to allocate from a different
2188 * CPU.
2189 */
2190 switch (dstate->dtds_state) {
2191 case DTRACE_DSTATE_CLEAN: {
2192 void *sp = &dstate->dtds_state;
2193
2194 if (++cpu >= (int)NCPU)
2195 cpu = 0;
2196
2197 if (dcpu->dtdsc_dirty != NULL &&
2198 nstate == DTRACE_DSTATE_EMPTY)
2199 nstate = DTRACE_DSTATE_DIRTY;
2200
2201 if (dcpu->dtdsc_rinsing != NULL)
2202 nstate = DTRACE_DSTATE_RINSING;
2203
2204 dcpu = &dstate->dtds_percpu[cpu];
2205
2206 if (cpu != me)
2207 goto retry;
2208
2209 (void) dtrace_cas32(sp,
2210 DTRACE_DSTATE_CLEAN, nstate);
2211
2212 /*
2213 * To increment the correct bean
2214 * counter, take another lap.
2215 */
2216 goto retry;
2217 }
2218
2219 case DTRACE_DSTATE_DIRTY:
2220 dcpu->dtdsc_dirty_drops++;
2221 break;
2222
2223 case DTRACE_DSTATE_RINSING:
2224 dcpu->dtdsc_rinsing_drops++;
2225 break;
2226
2227 case DTRACE_DSTATE_EMPTY:
2228 dcpu->dtdsc_drops++;
2229 break;
2230 }
2231
2232 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2233 return (NULL);
2234 }
2235
2236 /*
2237 * The clean list appears to be non-empty. We want to
2238 * move the clean list to the free list; we start by
2239 * moving the clean pointer aside.
2240 */
2241 if (dtrace_casptr(&dcpu->dtdsc_clean,
2242 clean, NULL) != clean) {
2243 /*
2244 * We are in one of two situations:
2245 *
2246 * (a) The clean list was switched to the
2247 * free list by another CPU.
2248 *
2249 * (b) The clean list was added to by the
2250 * cleansing cyclic.
2251 *
2252 * In either of these situations, we can
2253 * just reattempt the free list allocation.
2254 */
2255 goto retry;
2256 }
2257
2258 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2259
2260 /*
2261 * Now we'll move the clean list to the free list.
2262 * It's impossible for this to fail: the only way
2263 * the free list can be updated is through this
2264 * code path, and only one CPU can own the clean list.
2265 * Thus, it would only be possible for this to fail if
2266 * this code were racing with dtrace_dynvar_clean().
2267 * (That is, if dtrace_dynvar_clean() updated the clean
2268 * list, and we ended up racing to update the free
2269 * list.) This race is prevented by the dtrace_sync()
2270 * in dtrace_dynvar_clean() -- which flushes the
2271 * owners of the clean lists out before resetting
2272 * the clean lists.
2273 */
2274 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2275 ASSERT(rval == NULL);
2276 goto retry;
2277 }
2278
2279 dvar = free;
2280 new_free = dvar->dtdv_next;
2281 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2282
2283 /*
2284 * We have now allocated a new chunk. We copy the tuple keys into the
2285 * tuple array and copy any referenced key data into the data space
2286 * following the tuple array. As we do this, we relocate dttk_value
2287 * in the final tuple to point to the key data address in the chunk.
2288 */
2289 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2290 dvar->dtdv_data = (void *)(kdata + ksize);
2291 dvar->dtdv_tuple.dtt_nkeys = nkeys;
2292
2293 for (i = 0; i < nkeys; i++) {
2294 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2295 size_t kesize = key[i].dttk_size;
2296
2297 if (kesize != 0) {
2298 dtrace_bcopy(
2299 (const void *)(uintptr_t)key[i].dttk_value,
2300 (void *)kdata, kesize);
2301 dkey->dttk_value = kdata;
2302 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2303 } else {
2304 dkey->dttk_value = key[i].dttk_value;
2305 }
2306
2307 dkey->dttk_size = kesize;
2308 }
2309
2310 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2311 dvar->dtdv_hashval = hashval;
2312 dvar->dtdv_next = start;
2313
2314 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2315 return (dvar);
2316
2317 /*
2318 * The cas has failed. Either another CPU is adding an element to
2319 * this hash chain, or another CPU is deleting an element from this
2320 * hash chain. The simplest way to deal with both of these cases
2321 * (though not necessarily the most efficient) is to free our
2322 * allocated block and tail-call ourselves. Note that the free is
2323 * to the dirty list and _not_ to the free list. This is to prevent
2324 * races with allocators, above.
2325 */
2326 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2327
2328 dtrace_membar_producer();
2329
2330 do {
2331 free = dcpu->dtdsc_dirty;
2332 dvar->dtdv_next = free;
2333 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2334
2335 return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2336 }
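/*
 * The key hashing above is Jenkins' "One-at-a-time" hash, processing
 * by-value keys in 16-bit chunks and by-reference keys byte-by-byte.  A
 * byte-wise userspace reference of the same algorithm, for comparison
 * (illustrative only; not compiled here):
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static uint64_t
jenkins_one_at_a_time(const uint8_t *data, size_t len)
{
	uint64_t hashval = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		hashval += data[i];
		hashval += (hashval << 10);
		hashval ^= (hashval >> 6);
	}

	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	return (hashval);
}
#endif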
2337
2338 /*ARGSUSED*/
2339 static void
2340 dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2341 {
2342 #pragma unused(arg) /* __APPLE__ */
2343 if ((int64_t)nval < (int64_t)*oval)
2344 *oval = nval;
2345 }
2346
2347 /*ARGSUSED*/
2348 static void
2349 dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2350 {
2351 #pragma unused(arg) /* __APPLE__ */
2352 if ((int64_t)nval > (int64_t)*oval)
2353 *oval = nval;
2354 }
2355
2356 static void
2357 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2358 {
2359 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2360 int64_t val = (int64_t)nval;
2361
2362 if (val < 0) {
2363 for (i = 0; i < zero; i++) {
2364 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2365 quanta[i] += incr;
2366 return;
2367 }
2368 }
2369 } else {
2370 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2371 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2372 quanta[i - 1] += incr;
2373 return;
2374 }
2375 }
2376
2377 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2378 return;
2379 }
2380
2381 ASSERT(0);
2382 }
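/*
 * An illustrative walk through the bucketing above: for val = 6, the
 * loop stops at the first bucket whose DTRACE_QUANTIZE_BUCKETVAL()
 * exceeds 6 (namely 8) and credits the preceding bucket, whose value is
 * 4 -- i.e. the largest power-of-two bucket value that is <= val.
 * Negative values walk the buckets below DTRACE_QUANTIZE_ZEROBUCKET
 * analogously.
 */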
2383
2384 static void
2385 dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2386 {
2387 uint64_t arg = *lquanta++;
2388 int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2389 uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2390 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2391 int32_t val = (int32_t)nval, level;
2392
2393 ASSERT(step != 0);
2394 ASSERT(levels != 0);
2395
2396 if (val < base) {
2397 /*
2398 * This is an underflow.
2399 */
2400 lquanta[0] += incr;
2401 return;
2402 }
2403
2404 level = (val - base) / step;
2405
2406 if (level < levels) {
2407 lquanta[level + 1] += incr;
2408 return;
2409 }
2410
2411 /*
2412 * This is an overflow.
2413 */
2414 lquanta[levels + 1] += incr;
2415 }
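/*
 * An illustrative example: for an lquantize() encoded with base = 0,
 * step = 10 and levels = 10, a value of 42 yields
 * level = (42 - 0) / 10 = 4 and increments lquanta[5], the slot for the
 * [40, 50) range.  Values below base land in lquanta[0] (underflow), and
 * values at or beyond base + levels * step land in lquanta[levels + 1]
 * (overflow).
 */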
2416
2417 static int
2418 dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
2419 int16_t nsteps, int64_t value)
2420 {
2421 int64_t this = 1, last, next;
2422 int base = 1, order;
2423
2424 for (order = 0; order < low; ++order)
2425 this *= factor;
2426
2427 /*
2428 * If our value is less than our factor taken to the power of the
2429 * low order of magnitude, it goes into the zeroth bucket.
2430 */
2431 if (value < this)
2432 return 0;
2433 else
2434 last = this;
2435
2436 for (this *= factor; order <= high; ++order) {
2437 int nbuckets = this > nsteps ? nsteps : this;
2438
2439 /*
2440 * We should not generally get log/linear quantizations
2441 * with a high magnitude that allows 64-bits to
2442 * overflow, but we nonetheless protect against this
2443 * by explicitly checking for overflow, and clamping
2444 * our value accordingly.
2445 */
2446 next = this * factor;
2447 if (next < this) {
2448 value = this - 1;
2449 }
2450
2451 /*
2452 * If our value lies within this order of magnitude,
2453 * determine its position by taking the offset within
2454 * the order of magnitude, dividing by the bucket
2455 * width, and adding to our (accumulated) base.
2456 */
2457 if (value < this) {
2458 return (base + (value - last) / (this / nbuckets));
2459 }
2460
2461 base += nbuckets - (nbuckets / factor);
2462 last = this;
2463 this = next;
2464 }
2465
2466 /*
2467 * Our value is greater than or equal to our factor taken to the
2468 * power of one plus the high magnitude -- return the top bucket.
2469 */
2470 return base;
2471 }
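/*
 * An illustrative example: with factor = 10, low = 0, high = 2 and
 * nsteps = 10, a value of 42 falls within the order of magnitude
 * [10, 100), whose bucket width is 100 / 10 = 10; the function returns
 * the accumulated base of 10 plus (42 - 10) / 10 = 3, i.e. bucket 13.
 * Values below 10^0 = 1 map to bucket 0, and values >= 10^3 = 1000 map
 * to the top bucket (28 in this configuration).
 */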
2472
2473 static void
2474 dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2475 {
2476 uint64_t arg = *llquanta++;
2477 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2478 uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2479 uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2480 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2481
2482 llquanta[dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, nval)] += incr;
2483 }
2484
2485 /*ARGSUSED*/
2486 static void
2487 dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2488 {
2489 #pragma unused(arg) /* __APPLE__ */
2490 data[0]++;
2491 data[1] += nval;
2492 }
2493
2494 /*ARGSUSED*/
2495 static void
2496 dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2497 {
2498 #pragma unused(arg) /* __APPLE__ */
2499 int64_t snval = (int64_t)nval;
2500 uint64_t tmp[2];
2501
2502 data[0]++;
2503 data[1] += nval;
2504
2505 /*
2506 * What we want to say here is:
2507 *
2508 * data[2] += nval * nval;
2509 *
2510 * But given that nval is 64-bit, we could easily overflow, so
2511 * we do this as 128-bit arithmetic.
2512 */
2513 if (snval < 0)
2514 snval = -snval;
2515
2516 dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2517 dtrace_add_128(data + 2, tmp, data + 2);
2518 }
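/*
 * A consumer can recover the standard deviation from these running
 * totals as
 *
 *	sqrt(data[2..3] / data[0] - (data[1] / data[0])^2)
 *
 * i.e. sqrt(E[x^2] - E[x]^2), where data[2..3] denotes the 128-bit sum
 * of squares maintained above.
 */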
2519
2520 /*ARGSUSED*/
2521 static void
2522 dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2523 {
2524 #pragma unused(nval, arg) /* __APPLE__ */
2525 *oval = *oval + 1;
2526 }
2527
2528 /*ARGSUSED*/
2529 static void
2530 dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2531 {
2532 #pragma unused(arg) /* __APPLE__ */
2533 *oval += nval;
2534 }
2535
2536 /*
2537 * Aggregate given the tuple in the principal data buffer, and the aggregating
2538 * action denoted by the specified dtrace_aggregation_t. The aggregation
2539 * buffer is specified as the buf parameter. This routine does not return
2540 * failure; if there is no space in the aggregation buffer, the data will be
2541 * dropped, and a corresponding counter incremented.
2542 */
2543 static void
2544 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2545 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2546 {
2547 #pragma unused(arg)
2548 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2549 uint32_t i, ndx, size, fsize;
2550 uint32_t align = sizeof (uint64_t) - 1;
2551 dtrace_aggbuffer_t *agb;
2552 dtrace_aggkey_t *key;
2553 uint32_t hashval = 0, limit, isstr;
2554 caddr_t tomax, data, kdata;
2555 dtrace_actkind_t action;
2556 dtrace_action_t *act;
2557 uintptr_t offs;
2558
2559 if (buf == NULL)
2560 return;
2561
2562 if (!agg->dtag_hasarg) {
2563 /*
2564 * Currently, only quantize(), lquantize() and llquantize() take
2565 * additional arguments, and they have the same semantics: an increment
2566 * value that defaults to 1 when not present. If additional
2567 * aggregating actions take arguments, the setting of the
2568 * default argument value will presumably have to become more
2569 * sophisticated...
2570 */
2571 arg = 1;
2572 }
2573
2574 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2575 size = rec->dtrd_offset - agg->dtag_base;
2576 fsize = size + rec->dtrd_size;
2577
2578 ASSERT(dbuf->dtb_tomax != NULL);
2579 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2580
2581 if ((tomax = buf->dtb_tomax) == NULL) {
2582 dtrace_buffer_drop(buf);
2583 return;
2584 }
2585
2586 /*
2587 * The metastructure is always at the bottom of the buffer.
2588 */
2589 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2590 sizeof (dtrace_aggbuffer_t));
2591
2592 if (buf->dtb_offset == 0) {
2593 /*
2594 * We just kludge up approximately 1/8th of the size to be
2595 * buckets. If this guess ends up being routinely
2596 * off-the-mark, we may need to dynamically readjust this
2597 * based on past performance.
2598 */
2599 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2600
2601 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2602 (uintptr_t)tomax || hashsize == 0) {
2603 /*
2604 * We've been given a ludicrously small buffer;
2605 * increment our drop count and leave.
2606 */
2607 dtrace_buffer_drop(buf);
2608 return;
2609 }
2610
2611 /*
2612 * And now, a pathetic attempt to try to get an odd (or
2613 * perchance, a prime) hash size for better hash distribution.
2614 */
2615 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2616 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2617
2618 agb->dtagb_hashsize = hashsize;
2619 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2620 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2621 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2622
2623 for (i = 0; i < agb->dtagb_hashsize; i++)
2624 agb->dtagb_hash[i] = NULL;
2625 }
2626
2627 ASSERT(agg->dtag_first != NULL);
2628 ASSERT(agg->dtag_first->dta_intuple);
2629
2630 /*
2631 * Calculate the hash value based on the key. Note that we _don't_
2632 * include the aggid in the hashing (but we will store it as part of
2633 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2634 * algorithm: a simple, quick algorithm that has no known funnels, and
2635 * gets good distribution in practice. The efficacy of the hashing
2636 * algorithm (and a comparison with other algorithms) may be found by
2637 * running the ::dtrace_aggstat MDB dcmd.
2638 */
2639 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2640 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2641 limit = i + act->dta_rec.dtrd_size;
2642 ASSERT(limit <= size);
2643 isstr = DTRACEACT_ISSTRING(act);
2644
2645 for (; i < limit; i++) {
2646 hashval += data[i];
2647 hashval += (hashval << 10);
2648 hashval ^= (hashval >> 6);
2649
2650 if (isstr && data[i] == '\0')
2651 break;
2652 }
2653 }
2654
2655 hashval += (hashval << 3);
2656 hashval ^= (hashval >> 11);
2657 hashval += (hashval << 15);
2658
2659 /*
2660 * Yes, the divide here is expensive -- but it's generally the least
2661 * of the performance issues given the amount of data that we iterate
2662 * over to compute hash values, compare data, etc.
2663 */
2664 ndx = hashval % agb->dtagb_hashsize;
2665
2666 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2667 ASSERT((caddr_t)key >= tomax);
2668 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2669
2670 if (hashval != key->dtak_hashval || key->dtak_size != size)
2671 continue;
2672
2673 kdata = key->dtak_data;
2674 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2675
2676 for (act = agg->dtag_first; act->dta_intuple;
2677 act = act->dta_next) {
2678 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2679 limit = i + act->dta_rec.dtrd_size;
2680 ASSERT(limit <= size);
2681 isstr = DTRACEACT_ISSTRING(act);
2682
2683 for (; i < limit; i++) {
2684 if (kdata[i] != data[i])
2685 goto next;
2686
2687 if (isstr && data[i] == '\0')
2688 break;
2689 }
2690 }
2691
2692 if (action != key->dtak_action) {
2693 /*
2694 * We are aggregating on the same value in the same
2695 * aggregation with two different aggregating actions.
2696 * (This should have been picked up in the compiler,
2697 * so we may be dealing with errant or devious DIF.)
2698 * This is an error condition; we indicate as much,
2699 * and return.
2700 */
2701 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2702 return;
2703 }
2704
2705 /*
2706 * This is a hit: we need to apply the aggregator to
2707 * the value at this key.
2708 */
2709 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2710 return;
2711 next:
2712 continue;
2713 }
2714
2715 /*
2716 * We didn't find it. We need to allocate some zero-filled space,
2717 * link it into the hash table appropriately, and apply the aggregator
2718 * to the (zero-filled) value.
2719 */
2720 offs = buf->dtb_offset;
2721 while (offs & (align - 1))
2722 offs += sizeof (uint32_t);
2723
2724 /*
2725 * If we don't have enough room to both allocate a new key _and_
2726 * its associated data, increment the drop count and return.
2727 */
2728 if ((uintptr_t)tomax + offs + fsize >
2729 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2730 dtrace_buffer_drop(buf);
2731 return;
2732 }
2733
2734 /*CONSTCOND*/
2735 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2736 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2737 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2738
2739 key->dtak_data = kdata = tomax + offs;
2740 buf->dtb_offset = offs + fsize;
2741
2742 /*
2743 * Now copy the data across.
2744 */
2745 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2746
2747 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2748 kdata[i] = data[i];
2749
2750 /*
2751 * Because strings are not zeroed out by default, we need to iterate
2752 * looking for actions that store strings, and we need to explicitly
2753 * pad these strings out with zeroes.
2754 */
2755 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2756 int nul;
2757
2758 if (!DTRACEACT_ISSTRING(act))
2759 continue;
2760
2761 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2762 limit = i + act->dta_rec.dtrd_size;
2763 ASSERT(limit <= size);
2764
2765 for (nul = 0; i < limit; i++) {
2766 if (nul) {
2767 kdata[i] = '\0';
2768 continue;
2769 }
2770
2771 if (data[i] != '\0')
2772 continue;
2773
2774 nul = 1;
2775 }
2776 }
2777
2778 for (i = size; i < fsize; i++)
2779 kdata[i] = 0;
2780
2781 key->dtak_hashval = hashval;
2782 key->dtak_size = size;
2783 key->dtak_action = action;
2784 key->dtak_next = agb->dtagb_hash[ndx];
2785 agb->dtagb_hash[ndx] = key;
2786
2787 /*
2788 * Finally, apply the aggregator.
2789 */
2790 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2791 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2792 }
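/*
 * A sketch of the aggregation buffer layout manipulated above, with
 * addresses increasing to the right:
 *
 *	tomax                                            tomax + dtb_size
 *	| key data (grows up) ->   ...   <- aggkeys | hash table | agb |
 *	^ dtb_offset                    ^ dtagb_free
 *
 * Key data (the aggid, the tuple and the aggregated value) is appended
 * upward from dtb_offset, while dtrace_aggkey_t structures are carved
 * downward from dtagb_free; the record is dropped when the two would
 * collide.
 */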
2793
2794 /*
2795 * Given consumer state, this routine finds a speculation in the INACTIVE
2796 * state and transitions it into the ACTIVE state. If there is no speculation
2797 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2798 * incremented -- it is up to the caller to take appropriate action.
2799 */
2800 static int
2801 dtrace_speculation(dtrace_state_t *state)
2802 {
2803 int i = 0;
2804 dtrace_speculation_state_t current;
2805 uint32_t *stat = &state->dts_speculations_unavail, count;
2806
2807 while (i < state->dts_nspeculations) {
2808 dtrace_speculation_t *spec = &state->dts_speculations[i];
2809
2810 current = spec->dtsp_state;
2811
2812 if (current != DTRACESPEC_INACTIVE) {
2813 if (current == DTRACESPEC_COMMITTINGMANY ||
2814 current == DTRACESPEC_COMMITTING ||
2815 current == DTRACESPEC_DISCARDING)
2816 stat = &state->dts_speculations_busy;
2817 i++;
2818 continue;
2819 }
2820
2821 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2822 current, DTRACESPEC_ACTIVE) == current)
2823 return (i + 1);
2824 }
2825
2826 /*
2827 * We couldn't find a speculation. If we found as much as a single
2828 * busy speculation buffer, we'll attribute this failure as "busy"
2829 * instead of "unavail".
2830 */
2831 do {
2832 count = *stat;
2833 } while (dtrace_cas32(stat, count, count + 1) != count);
2834
2835 return (0);
2836 }
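/*
 * A simplified summary of the speculation state transitions effected by
 * dtrace_speculation() above and by the commit/discard/buffer routines
 * below (the authoritative diagram is in <sys/dtrace_impl.h>):
 *
 *	INACTIVE --(speculation())--> ACTIVE
 *	ACTIVE --(speculate())--> ACTIVEONE --(second CPU)--> ACTIVEMANY
 *	ACTIVEONE --(commit() on the speculating CPU)--> COMMITTING
 *	ACTIVEONE/ACTIVEMANY --(commit() elsewhere)--> COMMITTINGMANY
 *	ACTIVEONE/ACTIVEMANY --(discard())--> DISCARDING
 *	COMMITTING --> INACTIVE; COMMITTINGMANY/DISCARDING --> INACTIVE
 *	    once every CPU has been cleaned
 *
 * Every transition is performed with dtrace_cas32() so that racing CPUs
 * observe a consistent state.
 */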
2837
2838 /*
2839 * This routine commits an active speculation. If the specified speculation
2840 * is not in a valid state to perform a commit(), this routine will silently do
2841 * nothing. The state of the specified speculation is transitioned according
2842 * to the state transition diagram outlined in <sys/dtrace_impl.h>
2843 */
2844 static void
2845 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2846 dtrace_specid_t which)
2847 {
2848 dtrace_speculation_t *spec;
2849 dtrace_buffer_t *src, *dest;
2850 uintptr_t daddr, saddr, dlimit, slimit;
2851 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2852 intptr_t offs;
2853 uint64_t timestamp;
2854
2855 if (which == 0)
2856 return;
2857
2858 if (which > (dtrace_specid_t)state->dts_nspeculations) {
2859 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2860 return;
2861 }
2862
2863 spec = &state->dts_speculations[which - 1];
2864 src = &spec->dtsp_buffer[cpu];
2865 dest = &state->dts_buffer[cpu];
2866
2867 do {
2868 current = spec->dtsp_state;
2869
2870 if (current == DTRACESPEC_COMMITTINGMANY)
2871 break;
2872
2873 switch (current) {
2874 case DTRACESPEC_INACTIVE:
2875 case DTRACESPEC_DISCARDING:
2876 return;
2877
2878 case DTRACESPEC_COMMITTING:
2879 /*
2880 * This is only possible if we are (a) commit()'ing
2881 * without having done a prior speculate() on this CPU
2882 * and (b) racing with another commit() on a different
2883 * CPU. There's nothing to do -- we just assert that
2884 * our offset is 0.
2885 */
2886 ASSERT(src->dtb_offset == 0);
2887 return;
2888
2889 case DTRACESPEC_ACTIVE:
2890 new = DTRACESPEC_COMMITTING;
2891 break;
2892
2893 case DTRACESPEC_ACTIVEONE:
2894 /*
2895 * This speculation is active on one CPU. If our
2896 * buffer offset is non-zero, we know that the one CPU
2897 * must be us. Otherwise, we are committing on a
2898 * different CPU from the speculate(), and we must
2899 * rely on being asynchronously cleaned.
2900 */
2901 if (src->dtb_offset != 0) {
2902 new = DTRACESPEC_COMMITTING;
2903 break;
2904 }
2905 /*FALLTHROUGH*/
2906
2907 case DTRACESPEC_ACTIVEMANY:
2908 new = DTRACESPEC_COMMITTINGMANY;
2909 break;
2910
2911 default:
2912 ASSERT(0);
2913 }
2914 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2915 current, new) != current);
2916
2917 /*
2918 * We have set the state to indicate that we are committing this
2919 * speculation. Now reserve the necessary space in the destination
2920 * buffer.
2921 */
2922 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2923 sizeof (uint64_t), state, NULL)) < 0) {
2924 dtrace_buffer_drop(dest);
2925 goto out;
2926 }
2927
2928 /*
2929 * We have sufficient space to copy the speculative buffer into the
2930 * primary buffer. First, modify the speculative buffer, filling
2931 * in the timestamp of all entries with the current time. The data
2932 * must have the commit() time rather than the time it was traced,
2933 * so that all entries in the primary buffer are in timestamp order.
2934 */
2935 timestamp = dtrace_gethrtime();
2936 saddr = (uintptr_t)src->dtb_tomax;
2937 slimit = saddr + src->dtb_offset;
2938 while (saddr < slimit) {
2939 size_t size;
2940 dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2941
2942 if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2943 saddr += sizeof (dtrace_epid_t);
2944 continue;
2945 }
2946
2947 ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
2948 size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2949
2950 ASSERT(saddr + size <= slimit);
2951 ASSERT(size >= sizeof(dtrace_rechdr_t));
2952 ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);
2953
2954 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2955
2956 saddr += size;
2957 }
2958
2959 /*
2960 * Copy the buffer across. (Note that this is a
2961 * highly suboptimal bcopy(); in the unlikely event that this becomes
2962 * a serious performance issue, a high-performance DTrace-specific
2963 * bcopy() should obviously be invented.)
2964 */
2965 daddr = (uintptr_t)dest->dtb_tomax + offs;
2966 dlimit = daddr + src->dtb_offset;
2967 saddr = (uintptr_t)src->dtb_tomax;
2968
2969 /*
2970 * First, the aligned portion.
2971 */
2972 while (dlimit - daddr >= sizeof (uint64_t)) {
2973 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2974
2975 daddr += sizeof (uint64_t);
2976 saddr += sizeof (uint64_t);
2977 }
2978
2979 /*
2980 * Now any left-over bit...
2981 */
2982 while (dlimit - daddr)
2983 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2984
2985 /*
2986 * Finally, commit the reserved space in the destination buffer.
2987 */
2988 dest->dtb_offset = offs + src->dtb_offset;
2989
2990 out:
2991 /*
2992 * If we're lucky enough to be the only active CPU on this speculation
2993 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2994 */
2995 if (current == DTRACESPEC_ACTIVE ||
2996 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2997 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2998 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2999 #pragma unused(rval) /* __APPLE__ */
3000
3001 ASSERT(rval == DTRACESPEC_COMMITTING);
3002 }
3003
3004 src->dtb_offset = 0;
3005 src->dtb_xamot_drops += src->dtb_drops;
3006 src->dtb_drops = 0;
3007 }
3008
3009 /*
3010 * This routine discards an active speculation. If the specified speculation
3011 * is not in a valid state to perform a discard(), this routine will silently
3012 * do nothing. The state of the specified speculation is transitioned
3013 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
3014 */
3015 static void
3016 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
3017 dtrace_specid_t which)
3018 {
3019 dtrace_speculation_t *spec;
3020 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3021 dtrace_buffer_t *buf;
3022
3023 if (which == 0)
3024 return;
3025
3026 if (which > (dtrace_specid_t)state->dts_nspeculations) {
3027 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3028 return;
3029 }
3030
3031 spec = &state->dts_speculations[which - 1];
3032 buf = &spec->dtsp_buffer[cpu];
3033
3034 do {
3035 current = spec->dtsp_state;
3036
3037 switch (current) {
3038 case DTRACESPEC_INACTIVE:
3039 case DTRACESPEC_COMMITTINGMANY:
3040 case DTRACESPEC_COMMITTING:
3041 case DTRACESPEC_DISCARDING:
3042 return;
3043
3044 case DTRACESPEC_ACTIVE:
3045 case DTRACESPEC_ACTIVEMANY:
3046 new = DTRACESPEC_DISCARDING;
3047 break;
3048
3049 case DTRACESPEC_ACTIVEONE:
3050 if (buf->dtb_offset != 0) {
3051 new = DTRACESPEC_INACTIVE;
3052 } else {
3053 new = DTRACESPEC_DISCARDING;
3054 }
3055 break;
3056
3057 default:
3058 ASSERT(0);
3059 }
3060 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3061 current, new) != current);
3062
3063 buf->dtb_offset = 0;
3064 buf->dtb_drops = 0;
3065 }
3066
3067 /*
3068 * Note: not called from probe context. This function is called
3069 * asynchronously from cross call context to clean any speculations that are
3070 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
3071 * transitioned back to the INACTIVE state until all CPUs have cleaned the
3072 * speculation.
3073 */
3074 static void
3075 dtrace_speculation_clean_here(dtrace_state_t *state)
3076 {
3077 dtrace_icookie_t cookie;
3078 processorid_t cpu = CPU->cpu_id;
3079 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
3080 dtrace_specid_t i;
3081
3082 cookie = dtrace_interrupt_disable();
3083
3084 if (dest->dtb_tomax == NULL) {
3085 dtrace_interrupt_enable(cookie);
3086 return;
3087 }
3088
3089 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3090 dtrace_speculation_t *spec = &state->dts_speculations[i];
3091 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
3092
3093 if (src->dtb_tomax == NULL)
3094 continue;
3095
3096 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
3097 src->dtb_offset = 0;
3098 continue;
3099 }
3100
3101 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3102 continue;
3103
3104 if (src->dtb_offset == 0)
3105 continue;
3106
3107 dtrace_speculation_commit(state, cpu, i + 1);
3108 }
3109
3110 dtrace_interrupt_enable(cookie);
3111 }
3112
3113 /*
3114 * Note: not called from probe context. This function is called
3115 * asynchronously (and at a regular interval) to clean any speculations that
3116 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
3117 * is work to be done, it cross calls all CPUs to perform that work;
3118 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
3119 * INACTIVE state until they have been cleaned by all CPUs.
3120 */
3121 static void
3122 dtrace_speculation_clean(dtrace_state_t *state)
3123 {
3124 int work = 0;
3125 uint32_t rv;
3126 dtrace_specid_t i;
3127
3128 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3129 dtrace_speculation_t *spec = &state->dts_speculations[i];
3130
3131 ASSERT(!spec->dtsp_cleaning);
3132
3133 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
3134 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3135 continue;
3136
3137 work++;
3138 spec->dtsp_cleaning = 1;
3139 }
3140
3141 if (!work)
3142 return;
3143
3144 dtrace_xcall(DTRACE_CPUALL,
3145 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
3146
3147 /*
3148 * We now know that all CPUs have committed or discarded their
3149 * speculation buffers, as appropriate. We can now set the state
3150 * to inactive.
3151 */
3152 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3153 dtrace_speculation_t *spec = &state->dts_speculations[i];
3154 dtrace_speculation_state_t current, new;
3155
3156 if (!spec->dtsp_cleaning)
3157 continue;
3158
3159 current = spec->dtsp_state;
3160 ASSERT(current == DTRACESPEC_DISCARDING ||
3161 current == DTRACESPEC_COMMITTINGMANY);
3162
3163 new = DTRACESPEC_INACTIVE;
3164
3165 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
3166 ASSERT(rv == current);
3167 spec->dtsp_cleaning = 0;
3168 }
3169 }
3170
3171 /*
3172 * Called as part of a speculate() to get the speculative buffer associated
3173 * with a given speculation. Returns NULL if the specified speculation is not
3174 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
3175 * the active CPU is not the specified CPU -- the speculation will be
3176 * atomically transitioned into the ACTIVEMANY state.
3177 */
3178 static dtrace_buffer_t *
3179 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
3180 dtrace_specid_t which)
3181 {
3182 dtrace_speculation_t *spec;
3183 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3184 dtrace_buffer_t *buf;
3185
3186 if (which == 0)
3187 return (NULL);
3188
3189 if (which > (dtrace_specid_t)state->dts_nspeculations) {
3190 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3191 return (NULL);
3192 }
3193
3194 spec = &state->dts_speculations[which - 1];
3195 buf = &spec->dtsp_buffer[cpuid];
3196
3197 do {
3198 current = spec->dtsp_state;
3199
3200 switch (current) {
3201 case DTRACESPEC_INACTIVE:
3202 case DTRACESPEC_COMMITTINGMANY:
3203 case DTRACESPEC_DISCARDING:
3204 return (NULL);
3205
3206 case DTRACESPEC_COMMITTING:
3207 ASSERT(buf->dtb_offset == 0);
3208 return (NULL);
3209
3210 case DTRACESPEC_ACTIVEONE:
3211 /*
3212 * This speculation is currently active on one CPU.
3213 * Check the offset in the buffer; if it's non-zero,
3214 * that CPU must be us (and we leave the state alone).
3215 * If it's zero, assume that we're starting on a new
3216 * CPU -- and change the state to indicate that the
3217 * speculation is active on more than one CPU.
3218 */
3219 if (buf->dtb_offset != 0)
3220 return (buf);
3221
3222 new = DTRACESPEC_ACTIVEMANY;
3223 break;
3224
3225 case DTRACESPEC_ACTIVEMANY:
3226 return (buf);
3227
3228 case DTRACESPEC_ACTIVE:
3229 new = DTRACESPEC_ACTIVEONE;
3230 break;
3231
3232 default:
3233 ASSERT(0);
3234 }
3235 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3236 current, new) != current);
3237
3238 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3239 return (buf);
3240 }
3241
3242 /*
3243 * Return a string. In the event that the user lacks the privilege to access
3244 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3245 * don't fail access checking.
3246 *
3247 * dtrace_dif_variable() uses this routine as a helper for various
3248 * builtin values such as 'execname' and 'probefunc.'
3249 */
3250 static
3251 uintptr_t
3252 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3253 dtrace_mstate_t *mstate)
3254 {
3255 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3256 uintptr_t ret;
3257 size_t strsz;
3258
3259 /*
3260 * The easy case: this probe is allowed to read all of memory, so
3261 * we can just return this as a vanilla pointer.
3262 */
3263 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3264 return (addr);
3265
3266 /*
3267 * This is the tougher case: we copy the string in question from
3268 * kernel memory into scratch memory and return it that way: this
3269 * ensures that we won't trip up when access checking tests the
3270 * BYREF return value.
3271 */
3272 strsz = dtrace_strlen((char *)addr, size) + 1;
3273
3274 if (mstate->dtms_scratch_ptr + strsz >
3275 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3276 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3277 return (0);
3278 }
3279
3280 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3281 strsz);
3282 ret = mstate->dtms_scratch_ptr;
3283 mstate->dtms_scratch_ptr += strsz;
3284 return (ret);
3285 }
3286
3287 /*
3288 * This function implements the DIF emulator's variable lookups. The emulator
3289 * passes a reserved variable identifier and optional built-in array index.
3290 */
3291 static uint64_t
3292 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3293 uint64_t ndx)
3294 {
3295 /*
3296 * If we're accessing one of the uncached arguments, we'll turn this
3297 * into a reference in the args array.
3298 */
3299 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3300 ndx = v - DIF_VAR_ARG0;
3301 v = DIF_VAR_ARGS;
3302 }
3303
3304 switch (v) {
3305 case DIF_VAR_ARGS:
3306 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3307 if (ndx >= sizeof (mstate->dtms_arg) /
3308 sizeof (mstate->dtms_arg[0])) {
3309 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3310 dtrace_vstate_t *vstate = &state->dts_vstate;
3311 dtrace_provider_t *pv;
3312 uint64_t val;
3313
3314 pv = mstate->dtms_probe->dtpr_provider;
3315 if (pv->dtpv_pops.dtps_getargval != NULL)
3316 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3317 mstate->dtms_probe->dtpr_id,
3318 mstate->dtms_probe->dtpr_arg, ndx, aframes);
3319 /* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
3320 else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && ndx == 5) {
3321 return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
3322 }
3323
3324 else
3325 val = dtrace_getarg(ndx, aframes, mstate, vstate);
3326
3327 /*
3328 * This is regrettably required to keep the compiler
3329 * from tail-optimizing the call to dtrace_getarg().
3330 * The condition always evaluates to true, but the
3331 * compiler has no way of figuring that out a priori.
3332 * (None of this would be necessary if the compiler
3333 * could be relied upon to _always_ tail-optimize
3334 * the call to dtrace_getarg() -- but it can't.)
3335 */
3336 if (mstate->dtms_probe != NULL)
3337 return (val);
3338
3339 ASSERT(0);
3340 }
3341
3342 return (mstate->dtms_arg[ndx]);
3343
3344 case DIF_VAR_UREGS: {
3345 thread_t thread;
3346
3347 if (!dtrace_priv_proc(state))
3348 return (0);
3349
3350 if ((thread = current_thread()) == NULL) {
3351 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3352 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3353 return (0);
3354 }
3355
3356 return (dtrace_getreg(find_user_regs(thread), ndx));
3357 }
3358
3359
3360 case DIF_VAR_CURTHREAD:
3361 if (!dtrace_priv_kernel(state))
3362 return (0);
3363
3364 return ((uint64_t)(uintptr_t)current_thread());
3365
3366 case DIF_VAR_TIMESTAMP:
3367 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3368 mstate->dtms_timestamp = dtrace_gethrtime();
3369 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3370 }
3371 return (mstate->dtms_timestamp);
3372
3373 case DIF_VAR_VTIMESTAMP:
3374 ASSERT(dtrace_vtime_references != 0);
3375 return (dtrace_get_thread_vtime(current_thread()));
3376
3377 case DIF_VAR_WALLTIMESTAMP:
3378 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3379 mstate->dtms_walltimestamp = dtrace_gethrestime();
3380 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3381 }
3382 return (mstate->dtms_walltimestamp);
3383
3384 case DIF_VAR_MACHTIMESTAMP:
3385 if (!(mstate->dtms_present & DTRACE_MSTATE_MACHTIMESTAMP)) {
3386 mstate->dtms_machtimestamp = mach_absolute_time();
3387 mstate->dtms_present |= DTRACE_MSTATE_MACHTIMESTAMP;
3388 }
3389 return (mstate->dtms_machtimestamp);
3390
3391 case DIF_VAR_CPU:
3392 return ((uint64_t) dtrace_get_thread_last_cpu_id(current_thread()));
3393
3394 case DIF_VAR_IPL:
3395 if (!dtrace_priv_kernel(state))
3396 return (0);
3397 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3398 mstate->dtms_ipl = dtrace_getipl();
3399 mstate->dtms_present |= DTRACE_MSTATE_IPL;
3400 }
3401 return (mstate->dtms_ipl);
3402
3403 case DIF_VAR_EPID:
3404 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3405 return (mstate->dtms_epid);
3406
3407 case DIF_VAR_ID:
3408 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3409 return (mstate->dtms_probe->dtpr_id);
3410
3411 case DIF_VAR_STACKDEPTH:
3412 if (!dtrace_priv_kernel(state))
3413 return (0);
3414 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3415 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3416
3417 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3418 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3419 }
3420 return (mstate->dtms_stackdepth);
3421
3422 case DIF_VAR_USTACKDEPTH:
3423 if (!dtrace_priv_proc(state))
3424 return (0);
3425 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3426 /*
3427 * See comment in DIF_VAR_PID.
3428 */
3429 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3430 CPU_ON_INTR(CPU)) {
3431 mstate->dtms_ustackdepth = 0;
3432 } else {
3433 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3434 mstate->dtms_ustackdepth =
3435 dtrace_getustackdepth();
3436 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3437 }
3438 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3439 }
3440 return (mstate->dtms_ustackdepth);
3441
3442 case DIF_VAR_CALLER:
3443 if (!dtrace_priv_kernel(state))
3444 return (0);
3445 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3446 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3447
3448 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3449 /*
3450 * If this is an unanchored probe, we are
3451 * required to go through the slow path:
3452 * dtrace_caller() only guarantees correct
3453 * results for anchored probes.
3454 */
3455 pc_t caller[2];
3456
3457 dtrace_getpcstack(caller, 2, aframes,
3458 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3459 mstate->dtms_caller = caller[1];
3460 } else if ((mstate->dtms_caller =
3461 dtrace_caller(aframes)) == (uintptr_t)-1) {
3462 /*
3463 * We have failed to do this the quick way;
3464 * we must resort to the slower approach of
3465 * calling dtrace_getpcstack().
3466 */
3467 pc_t caller;
3468
3469 dtrace_getpcstack(&caller, 1, aframes, NULL);
3470 mstate->dtms_caller = caller;
3471 }
3472
3473 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3474 }
3475 return (mstate->dtms_caller);
3476
3477 case DIF_VAR_UCALLER:
3478 if (!dtrace_priv_proc(state))
3479 return (0);
3480
3481 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3482 uint64_t ustack[3];
3483
3484 /*
3485 * dtrace_getupcstack() fills in the first uint64_t
3486 * with the current PID. The second uint64_t will
3487 * be the program counter at user-level. The third
3488 * uint64_t will contain the caller, which is what
3489 * we're after.
3490 */
3491 ustack[2] = 0;
3492 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3493 dtrace_getupcstack(ustack, 3);
3494 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3495 mstate->dtms_ucaller = ustack[2];
3496 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3497 }
3498
3499 return (mstate->dtms_ucaller);
3500
3501 case DIF_VAR_PROBEPROV:
3502 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3503 return (dtrace_dif_varstr(
3504 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3505 state, mstate));
3506
3507 case DIF_VAR_PROBEMOD:
3508 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3509 return (dtrace_dif_varstr(
3510 (uintptr_t)mstate->dtms_probe->dtpr_mod,
3511 state, mstate));
3512
3513 case DIF_VAR_PROBEFUNC:
3514 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3515 return (dtrace_dif_varstr(
3516 (uintptr_t)mstate->dtms_probe->dtpr_func,
3517 state, mstate));
3518
3519 case DIF_VAR_PROBENAME:
3520 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3521 return (dtrace_dif_varstr(
3522 (uintptr_t)mstate->dtms_probe->dtpr_name,
3523 state, mstate));
3524
3525 case DIF_VAR_PID:
3526 if (!dtrace_priv_proc_relaxed(state))
3527 return (0);
3528
3529 /*
3530 * Note that we are assuming that an unanchored probe is
3531 * always due to a high-level interrupt. (And we're assuming
3532 * that there is only a single high level interrupt.)
3533 */
3534 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3535 /* Anchored probe that fires while on an interrupt accrues to process 0 */
3536 return 0;
3537
3538 return ((uint64_t)dtrace_proc_selfpid());
3539
3540 case DIF_VAR_PPID:
3541 if (!dtrace_priv_proc_relaxed(state))
3542 return (0);
3543
3544 /*
3545 * See comment in DIF_VAR_PID.
3546 */
3547 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3548 return (0);
3549
3550 return ((uint64_t)dtrace_proc_selfppid());
3551
3552 case DIF_VAR_TID:
3553 /* We do not need to check for null current_thread() */
3554 return thread_tid(current_thread()); /* globally unique */
3555
3556 case DIF_VAR_PTHREAD_SELF:
3557 if (!dtrace_priv_proc(state))
3558 return (0);
3559
3560 /* Not currently supported, but we should be able to delta the dispatchqaddr and dispatchqoffset to get pthread_self */
3561 return 0;
3562
3563 case DIF_VAR_DISPATCHQADDR:
3564 if (!dtrace_priv_proc(state))
3565 return (0);
3566
3567 /* We do not need to check for null current_thread() */
3568 return thread_dispatchqaddr(current_thread());
3569
3570 case DIF_VAR_EXECNAME:
3571 {
3572 char *xname = (char *)mstate->dtms_scratch_ptr;
3573 size_t scratch_size = MAXCOMLEN+1;
3574
3575 /* The scratch allocation's lifetime is that of the clause. */
3576 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3577 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3578 return 0;
3579 }
3580
3581 if (!dtrace_priv_proc_relaxed(state))
3582 return (0);
3583
3584 mstate->dtms_scratch_ptr += scratch_size;
3585 proc_selfname( xname, scratch_size );
3586
3587 return ((uint64_t)(uintptr_t)xname);
3588 }
3589
3590
3591 case DIF_VAR_ZONENAME:
3592 {
3593 /* scratch_size is equal to length('global') + 1 for the null-terminator. */
3594 char *zname = (char *)mstate->dtms_scratch_ptr;
3595 size_t scratch_size = 6 + 1;
3596
3597 if (!dtrace_priv_proc(state))
3598 return (0);
3599
3600 /* The scratch allocation's lifetime is that of the clause. */
3601 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3602 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3603 return 0;
3604 }
3605
3606 mstate->dtms_scratch_ptr += scratch_size;
3607
3608 /* The kernel does not provide zonename; this always returns 'global'. */
3609 strlcpy(zname, "global", scratch_size);
3610
3611 return ((uint64_t)(uintptr_t)zname);
3612 }
3613
3614 #if MONOTONIC
3615 case DIF_VAR_CPUINSTRS:
3616 return mt_cur_cpu_instrs();
3617
3618 case DIF_VAR_CPUCYCLES:
3619 return mt_cur_cpu_cycles();
3620
3621 case DIF_VAR_VINSTRS:
3622 return mt_cur_thread_instrs();
3623
3624 case DIF_VAR_VCYCLES:
3625 return mt_cur_thread_cycles();
3626 #else /* MONOTONIC */
3627 case DIF_VAR_CPUINSTRS: /* FALLTHROUGH */
3628 case DIF_VAR_CPUCYCLES: /* FALLTHROUGH */
3629 case DIF_VAR_VINSTRS: /* FALLTHROUGH */
3630 case DIF_VAR_VCYCLES: /* FALLTHROUGH */
3631 return 0;
3632 #endif /* !MONOTONIC */
3633
3634 case DIF_VAR_UID:
3635 if (!dtrace_priv_proc_relaxed(state))
3636 return (0);
3637
3638 /*
3639 * See comment in DIF_VAR_PID.
3640 */
3641 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3642 return (0);
3643
3644 return ((uint64_t) dtrace_proc_selfruid());
3645
3646 case DIF_VAR_GID:
3647 if (!dtrace_priv_proc(state))
3648 return (0);
3649
3650 /*
3651 * See comment in DIF_VAR_PID.
3652 */
3653 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3654 return (0);
3655
3656 if (dtrace_CRED() != NULL)
3657 /* Credential does not require lazy initialization. */
3658 return ((uint64_t)kauth_getgid());
3659 else {
3660 /* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
3661 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3662 return -1ULL;
3663 }
3664
3665 case DIF_VAR_ERRNO: {
3666 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
3667 if (!dtrace_priv_proc(state))
3668 return (0);
3669
3670 /*
3671 * See comment in DIF_VAR_PID.
3672 */
3673 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3674 return (0);
3675
3676 if (uthread)
3677 return (uint64_t)uthread->t_dtrace_errno;
3678 else {
3679 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3680 return -1ULL;
3681 }
3682 }
3683
3684 default:
3685 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3686 return (0);
3687 }
3688 }
3689
3690 typedef enum dtrace_json_state {
3691 DTRACE_JSON_REST = 1,
3692 DTRACE_JSON_OBJECT,
3693 DTRACE_JSON_STRING,
3694 DTRACE_JSON_STRING_ESCAPE,
3695 DTRACE_JSON_STRING_ESCAPE_UNICODE,
3696 DTRACE_JSON_COLON,
3697 DTRACE_JSON_COMMA,
3698 DTRACE_JSON_VALUE,
3699 DTRACE_JSON_IDENTIFIER,
3700 DTRACE_JSON_NUMBER,
3701 DTRACE_JSON_NUMBER_FRAC,
3702 DTRACE_JSON_NUMBER_EXP,
3703 DTRACE_JSON_COLLECT_OBJECT
3704 } dtrace_json_state_t;
3705
3706 /*
3707 * This function possesses just enough knowledge about JSON to extract a single
3708 * value from a JSON string and store it in the scratch buffer. It is able
3709 * to extract nested object values, and members of arrays by index.
3710 *
3711 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3712 * be looked up as we descend into the object tree. e.g.
3713 *
3714 * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3715 * with nelems = 5.
3716 *
3717 * The run time of this function must be bounded above by strsize to limit the
3718 * amount of work done in probe context. As such, it is implemented as a
3719 * simple state machine, reading one character at a time using safe loads
3720 * until we find the requested element, hit a parsing error or run off the
3721 * end of the object or string.
3722 *
3723 * As there is no way for a subroutine to return an error without interrupting
3724 * clause execution, we simply return NULL in the event of a missing key or any
3725 * other error condition. Each NULL return in this function is commented with
3726 * the error condition it represents -- parsing or otherwise.
3727 *
3728 * The set of states for the state machine closely matches the JSON
3729 * specification (http://json.org/). Briefly:
3730 *
3731 * DTRACE_JSON_REST:
3732 * Skip whitespace until we find either a top-level Object, moving
3733 * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3734 *
3735 * DTRACE_JSON_OBJECT:
3736 * Locate the next key String in an Object. Sets a flag to denote
3737 * the next String as a key string and moves to DTRACE_JSON_STRING.
3738 *
3739 * DTRACE_JSON_COLON:
3740 * Skip whitespace until we find the colon that separates key Strings
3741 * from their values. Once found, move to DTRACE_JSON_VALUE.
3742 *
3743 * DTRACE_JSON_VALUE:
3744 * Detects the type of the next value (String, Number, Identifier, Object
3745 * or Array) and routes to the states that process that type. Here we also
3746 * deal with the element selector list if we are requested to traverse down
3747 * into the object tree.
3748 *
3749 * DTRACE_JSON_COMMA:
3750 * Skip whitespace until we find the comma that separates key-value pairs
3751 * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3752 * (similarly DTRACE_JSON_VALUE). All following literal value processing
3753 * states return to this state at the end of their value, unless otherwise
3754 * noted.
3755 *
3756 * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3757 * Processes a Number literal from the JSON, including any exponent
3758 * component that may be present. Numbers are returned as strings, which
3759 * may be passed to strtoll() if an integer is required.
3760 *
3761 * DTRACE_JSON_IDENTIFIER:
3762 * Processes a "true", "false" or "null" literal in the JSON.
3763 *
3764 * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3765 * DTRACE_JSON_STRING_ESCAPE_UNICODE:
3766 * Processes a String literal from the JSON, whether the String denotes
3767 * a key, a value or part of a larger Object. Handles all escape sequences
3768 * present in the specification, including four-digit unicode characters,
3769 * but merely includes the escape sequence without converting it to the
3770 * actual escaped character. If the String is flagged as a key, we
3771 * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3772 *
3773 * DTRACE_JSON_COLLECT_OBJECT:
3774 * This state collects an entire Object (or Array), correctly handling
3775 * embedded strings. If the full element selector list matches this nested
3776 * object, we return the Object in full as a string. If not, we use this
3777 * state to skip to the next value at this level and continue processing.
3778 */
3779 static char *
3780 dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3781 char *dest)
3782 {
3783 dtrace_json_state_t state = DTRACE_JSON_REST;
3784 int64_t array_elem = INT64_MIN;
3785 int64_t array_pos = 0;
3786 uint8_t escape_unicount = 0;
3787 boolean_t string_is_key = B_FALSE;
3788 boolean_t collect_object = B_FALSE;
3789 boolean_t found_key = B_FALSE;
3790 boolean_t in_array = B_FALSE;
3791 uint32_t braces = 0, brackets = 0;
3792 char *elem = elemlist;
3793 char *dd = dest;
3794 uintptr_t cur;
3795
3796 for (cur = json; cur < json + size; cur++) {
3797 char cc = dtrace_load8(cur);
3798 if (cc == '\0')
3799 return (NULL);
3800
3801 switch (state) {
3802 case DTRACE_JSON_REST:
3803 if (isspace(cc))
3804 break;
3805
3806 if (cc == '{') {
3807 state = DTRACE_JSON_OBJECT;
3808 break;
3809 }
3810
3811 if (cc == '[') {
3812 in_array = B_TRUE;
3813 array_pos = 0;
3814 array_elem = dtrace_strtoll(elem, 10, size);
3815 found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3816 state = DTRACE_JSON_VALUE;
3817 break;
3818 }
3819
3820 /*
3821 * ERROR: expected to find a top-level object or array.
3822 */
3823 return (NULL);
3824 case DTRACE_JSON_OBJECT:
3825 if (isspace(cc))
3826 break;
3827
3828 if (cc == '"') {
3829 state = DTRACE_JSON_STRING;
3830 string_is_key = B_TRUE;
3831 break;
3832 }
3833
3834 /*
3835 * ERROR: either the object did not start with a key
3836 * string, or we've run off the end of the object
3837 * without finding the requested key.
3838 */
3839 return (NULL);
3840 case DTRACE_JSON_STRING:
3841 if (cc == '\\') {
3842 *dd++ = '\\';
3843 state = DTRACE_JSON_STRING_ESCAPE;
3844 break;
3845 }
3846
3847 if (cc == '"') {
3848 if (collect_object) {
3849 /*
3850 * We don't reset the dest here, as
3851 * the string is part of a larger
3852 * object being collected.
3853 */
3854 *dd++ = cc;
3855 collect_object = B_FALSE;
3856 state = DTRACE_JSON_COLLECT_OBJECT;
3857 break;
3858 }
3859 *dd = '\0';
3860 dd = dest; /* reset string buffer */
3861 if (string_is_key) {
3862 if (dtrace_strncmp(dest, elem,
3863 size) == 0)
3864 found_key = B_TRUE;
3865 } else if (found_key) {
3866 if (nelems > 1) {
3867 /*
3868 * We expected an object, not
3869 * this string.
3870 */
3871 return (NULL);
3872 }
3873 return (dest);
3874 }
3875 state = string_is_key ? DTRACE_JSON_COLON :
3876 DTRACE_JSON_COMMA;
3877 string_is_key = B_FALSE;
3878 break;
3879 }
3880
3881 *dd++ = cc;
3882 break;
3883 case DTRACE_JSON_STRING_ESCAPE:
3884 *dd++ = cc;
3885 if (cc == 'u') {
3886 escape_unicount = 0;
3887 state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3888 } else {
3889 state = DTRACE_JSON_STRING;
3890 }
3891 break;
3892 case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3893 if (!isxdigit(cc)) {
3894 /*
3895 * ERROR: invalid unicode escape, expected
3896 * four valid hexadecimal digits.
3897 */
3898 return (NULL);
3899 }
3900
3901 *dd++ = cc;
3902 if (++escape_unicount == 4)
3903 state = DTRACE_JSON_STRING;
3904 break;
3905 case DTRACE_JSON_COLON:
3906 if (isspace(cc))
3907 break;
3908
3909 if (cc == ':') {
3910 state = DTRACE_JSON_VALUE;
3911 break;
3912 }
3913
3914 /*
3915 * ERROR: expected a colon.
3916 */
3917 return (NULL);
3918 case DTRACE_JSON_COMMA:
3919 if (isspace(cc))
3920 break;
3921
3922 if (cc == ',') {
3923 if (in_array) {
3924 state = DTRACE_JSON_VALUE;
3925 if (++array_pos == array_elem)
3926 found_key = B_TRUE;
3927 } else {
3928 state = DTRACE_JSON_OBJECT;
3929 }
3930 break;
3931 }
3932
3933 /*
3934 * ERROR: either we hit an unexpected character, or
3935 * we reached the end of the object or array without
3936 * finding the requested key.
3937 */
3938 return (NULL);
3939 case DTRACE_JSON_IDENTIFIER:
3940 if (islower(cc)) {
3941 *dd++ = cc;
3942 break;
3943 }
3944
3945 *dd = '\0';
3946 dd = dest; /* reset string buffer */
3947
3948 if (dtrace_strncmp(dest, "true", 5) == 0 ||
3949 dtrace_strncmp(dest, "false", 6) == 0 ||
3950 dtrace_strncmp(dest, "null", 5) == 0) {
3951 if (found_key) {
3952 if (nelems > 1) {
3953 /*
3954 * ERROR: We expected an object,
3955 * not this identifier.
3956 */
3957 return (NULL);
3958 }
3959 return (dest);
3960 } else {
3961 cur--;
3962 state = DTRACE_JSON_COMMA;
3963 break;
3964 }
3965 }
3966
3967 /*
3968 * ERROR: we did not recognise the identifier as one
3969 * of those in the JSON specification.
3970 */
3971 return (NULL);
3972 case DTRACE_JSON_NUMBER:
3973 if (cc == '.') {
3974 *dd++ = cc;
3975 state = DTRACE_JSON_NUMBER_FRAC;
3976 break;
3977 }
3978
3979 if (cc == 'x' || cc == 'X') {
3980 /*
3981 * ERROR: specification explicitly excludes
3982 * hexadecimal or octal numbers.
3983 */
3984 return (NULL);
3985 }
3986
3987 /* FALLTHRU */
3988 case DTRACE_JSON_NUMBER_FRAC:
3989 if (cc == 'e' || cc == 'E') {
3990 *dd++ = cc;
3991 state = DTRACE_JSON_NUMBER_EXP;
3992 break;
3993 }
3994
3995 if (cc == '+' || cc == '-') {
3996 /*
3997 * ERROR: expect sign as part of exponent only.
3998 */
3999 return (NULL);
4000 }
4001 /* FALLTHRU */
4002 case DTRACE_JSON_NUMBER_EXP:
4003 if (isdigit(cc) || cc == '+' || cc == '-') {
4004 *dd++ = cc;
4005 break;
4006 }
4007
4008 *dd = '\0';
4009 dd = dest; /* reset string buffer */
4010 if (found_key) {
4011 if (nelems > 1) {
4012 /*
4013 * ERROR: We expected an object, not
4014 * this number.
4015 */
4016 return (NULL);
4017 }
4018 return (dest);
4019 }
4020
4021 cur--;
4022 state = DTRACE_JSON_COMMA;
4023 break;
4024 case DTRACE_JSON_VALUE:
4025 if (isspace(cc))
4026 break;
4027
4028 if (cc == '{' || cc == '[') {
4029 if (nelems > 1 && found_key) {
4030 in_array = cc == '[' ? B_TRUE : B_FALSE;
4031 /*
4032 * If our element selector directs us
4033 * to descend into this nested object,
4034 * then move to the next selector
4035 * element in the list and restart the
4036 * state machine.
4037 */
4038 while (*elem != '\0')
4039 elem++;
4040 elem++; /* skip the inter-element NUL */
4041 nelems--;
4042 dd = dest;
4043 if (in_array) {
4044 state = DTRACE_JSON_VALUE;
4045 array_pos = 0;
4046 array_elem = dtrace_strtoll(
4047 elem, 10, size);
4048 found_key = array_elem == 0 ?
4049 B_TRUE : B_FALSE;
4050 } else {
4051 found_key = B_FALSE;
4052 state = DTRACE_JSON_OBJECT;
4053 }
4054 break;
4055 }
4056
4057 /*
4058 * Otherwise, we wish to either skip this
4059 * nested object or return it in full.
4060 */
4061 if (cc == '[')
4062 brackets = 1;
4063 else
4064 braces = 1;
4065 *dd++ = cc;
4066 state = DTRACE_JSON_COLLECT_OBJECT;
4067 break;
4068 }
4069
4070 if (cc == '"') {
4071 state = DTRACE_JSON_STRING;
4072 break;
4073 }
4074
4075 if (islower(cc)) {
4076 /*
4077 * Here we deal with true, false and null.
4078 */
4079 *dd++ = cc;
4080 state = DTRACE_JSON_IDENTIFIER;
4081 break;
4082 }
4083
4084 if (cc == '-' || isdigit(cc)) {
4085 *dd++ = cc;
4086 state = DTRACE_JSON_NUMBER;
4087 break;
4088 }
4089
4090 /*
4091 * ERROR: unexpected character at start of value.
4092 */
4093 return (NULL);
4094 case DTRACE_JSON_COLLECT_OBJECT:
4095 if (cc == '\0')
4096 /*
4097 * ERROR: unexpected end of input.
4098 */
4099 return (NULL);
4100
4101 *dd++ = cc;
4102 if (cc == '"') {
4103 collect_object = B_TRUE;
4104 state = DTRACE_JSON_STRING;
4105 break;
4106 }
4107
4108 if (cc == ']') {
4109 if (brackets-- == 0) {
4110 /*
4111 * ERROR: unbalanced brackets.
4112 */
4113 return (NULL);
4114 }
4115 } else if (cc == '}') {
4116 if (braces-- == 0) {
4117 /*
4118 * ERROR: unbalanced braces.
4119 */
4120 return (NULL);
4121 }
4122 } else if (cc == '{') {
4123 braces++;
4124 } else if (cc == '[') {
4125 brackets++;
4126 }
4127
4128 if (brackets == 0 && braces == 0) {
4129 if (found_key) {
4130 *dd = '\0';
4131 return (dest);
4132 }
4133 dd = dest; /* reset string buffer */
4134 state = DTRACE_JSON_COMMA;
4135 }
4136 break;
4137 }
4138 }
4139 return (NULL);
4140 }
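
/*
 * Editorial sketch (hypothetical helper, not compiled into the kernel):
 * how a selector such as "foo[0].bar" is packed into the elemlist format
 * described above.  In the kernel proper this packing is performed by the
 * DIF_SUBR_JSON case in dtrace_dif_subr(), further below.
 */
static void
dtrace_json_usage_sketch(void)
{
	/* "foo[0].bar" --> "foo" NUL "0" NUL "bar" NUL, with nelems = 3 */
	char json[] = "{\"foo\": [{\"bar\": 42}]}";
	char elemlist[] = "foo\0" "0\0" "bar";	/* literal supplies final NUL */
	char dest[64];
	char *v;

	v = dtrace_json(sizeof (json) - 1, (uintptr_t)json, elemlist, 3, dest);
	(void)v;	/* v points into dest, which holds "42"; NULL on any error */
}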
4141
4142 /*
4143 * Emulate the execution of DTrace DIF subroutines invoked by the call opcode.
4144 * Notice that we don't bother validating the proper number of arguments or
4145 * their types in the tuple stack: argument interpretation is rendered safe
4146 * by our load safety, so the worst that can happen is that a bogus program
4147 * obtains bogus results.
4148 */
4149 static void
4150 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4151 dtrace_key_t *tupregs, int nargs,
4152 dtrace_mstate_t *mstate, dtrace_state_t *state)
4153 {
4154 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4155 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4156 dtrace_vstate_t *vstate = &state->dts_vstate;
4157
4158 #if !defined(__APPLE__)
4159 union {
4160 mutex_impl_t mi;
4161 uint64_t mx;
4162 } m;
4163
4164 union {
4165 krwlock_t ri;
4166 uintptr_t rw;
4167 } r;
4168 #else
4169 /* FIXME: awaits lock/mutex work */
4170 #endif /* __APPLE__ */
4171
4172 switch (subr) {
4173 case DIF_SUBR_RAND:
4174 regs[rd] = dtrace_xoroshiro128_plus_next(
4175 state->dts_rstate[CPU->cpu_id]);
4176 break;
4177
4178 #if !defined(__APPLE__)
4179 case DIF_SUBR_MUTEX_OWNED:
4180 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4181 mstate, vstate)) {
4182 regs[rd] = 0;
4183 break;
4184 }
4185
4186 m.mx = dtrace_load64(tupregs[0].dttk_value);
4187 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4188 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4189 else
4190 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4191 break;
4192
4193 case DIF_SUBR_MUTEX_OWNER:
4194 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4195 mstate, vstate)) {
4196 regs[rd] = 0;
4197 break;
4198 }
4199
4200 m.mx = dtrace_load64(tupregs[0].dttk_value);
4201 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4202 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4203 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4204 else
4205 regs[rd] = 0;
4206 break;
4207
4208 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4209 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4210 mstate, vstate)) {
4211 regs[rd] = 0;
4212 break;
4213 }
4214
4215 m.mx = dtrace_load64(tupregs[0].dttk_value);
4216 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4217 break;
4218
4219 case DIF_SUBR_MUTEX_TYPE_SPIN:
4220 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4221 mstate, vstate)) {
4222 regs[rd] = 0;
4223 break;
4224 }
4225
4226 m.mx = dtrace_load64(tupregs[0].dttk_value);
4227 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4228 break;
4229
4230 case DIF_SUBR_RW_READ_HELD: {
4231 uintptr_t tmp;
4232
4233 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4234 mstate, vstate)) {
4235 regs[rd] = 0;
4236 break;
4237 }
4238
4239 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4240 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4241 break;
4242 }
4243
4244 case DIF_SUBR_RW_WRITE_HELD:
4245 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4246 mstate, vstate)) {
4247 regs[rd] = 0;
4248 break;
4249 }
4250
4251 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4252 regs[rd] = _RW_WRITE_HELD(&r.ri);
4253 break;
4254
4255 case DIF_SUBR_RW_ISWRITER:
4256 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4257 mstate, vstate)) {
4258 regs[rd] = 0;
4259 break;
4260 }
4261
4262 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4263 regs[rd] = _RW_ISWRITER(&r.ri);
4264 break;
4265 #else
4266 /* FIXME: awaits lock/mutex work */
4267 #endif /* __APPLE__ */
4268
4269 case DIF_SUBR_BCOPY: {
4270 /*
4271 * We need to be sure that the destination is in the scratch
4272 * region -- no other region is allowed.
4273 */
4274 uintptr_t src = tupregs[0].dttk_value;
4275 uintptr_t dest = tupregs[1].dttk_value;
4276 size_t size = tupregs[2].dttk_value;
4277
4278 if (!dtrace_inscratch(dest, size, mstate)) {
4279 *flags |= CPU_DTRACE_BADADDR;
4280 *illval = regs[rd];
4281 break;
4282 }
4283
4284 if (!dtrace_canload(src, size, mstate, vstate)) {
4285 regs[rd] = 0;
4286 break;
4287 }
4288
4289 dtrace_bcopy((void *)src, (void *)dest, size);
4290 break;
4291 }
4292
4293 case DIF_SUBR_ALLOCA:
4294 case DIF_SUBR_COPYIN: {
4295 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4296 uint64_t size =
4297 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4298 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4299
4300 /*
4301 * Check whether the user can access kernel memory
4302 */
4303 if (dtrace_priv_kernel(state) == 0) {
4304 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
4305 regs[rd] = 0;
4306 break;
4307 }
4308 /*
4309 * This action doesn't require any credential checks since
4310 * probes will not activate in user contexts to which the
4311 * enabling user does not have permissions.
4312 */
4313
4314 /*
4315 * Rounding up the user allocation size could have overflowed
4316 * a large, bogus allocation (like -1ULL) to 0.
4317 */
4318 if (scratch_size < size ||
4319 !DTRACE_INSCRATCH(mstate, scratch_size)) {
4320 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4321 regs[rd] = 0;
4322 break;
4323 }
4324
4325 if (subr == DIF_SUBR_COPYIN) {
4326 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4327 if (dtrace_priv_proc(state))
4328 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4329 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4330 }
4331
4332 mstate->dtms_scratch_ptr += scratch_size;
4333 regs[rd] = dest;
4334 break;
4335 }
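
	/*
	 * Editorial worked example of the overflow check above: with one
	 * byte of alignment slack from P2ROUNDUP() and a bogus request of
	 * -1ULL, the scratch_size arithmetic wraps, and it is exactly the
	 * "scratch_size < size" test that rejects it:
	 *
	 *	dest         = scratch_ptr + 1         (rounded up to 8)
	 *	scratch_size = 1 + 0xffffffffffffffff  (wraps to 0)
	 *	0 < -1ULL                              (reject)
	 */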
4336
4337 case DIF_SUBR_COPYINTO: {
4338 uint64_t size = tupregs[1].dttk_value;
4339 uintptr_t dest = tupregs[2].dttk_value;
4340
4341 /*
4342 * This action doesn't require any credential checks since
4343 * probes will not activate in user contexts to which the
4344 * enabling user does not have permissions.
4345 */
4346 if (!dtrace_inscratch(dest, size, mstate)) {
4347 *flags |= CPU_DTRACE_BADADDR;
4348 *illval = regs[rd];
4349 break;
4350 }
4351
4352 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4353 if (dtrace_priv_proc(state))
4354 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4355 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4356 break;
4357 }
4358
4359 case DIF_SUBR_COPYINSTR: {
4360 uintptr_t dest = mstate->dtms_scratch_ptr;
4361 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4362
4363 if (nargs > 1 && tupregs[1].dttk_value < size)
4364 size = tupregs[1].dttk_value + 1;
4365
4366 /*
4367 * This action doesn't require any credential checks since
4368 * probes will not activate in user contexts to which the
4369 * enabling user does not have permissions.
4370 */
4371 if (!DTRACE_INSCRATCH(mstate, size)) {
4372 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4373 regs[rd] = 0;
4374 break;
4375 }
4376
4377 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4378 if (dtrace_priv_proc(state))
4379 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4380 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4381
4382 ((char *)dest)[size - 1] = '\0';
4383 mstate->dtms_scratch_ptr += size;
4384 regs[rd] = dest;
4385 break;
4386 }
4387
4388 case DIF_SUBR_MSGSIZE:
4389 case DIF_SUBR_MSGDSIZE: {
4390 /* Darwin does not implement SysV streams messages */
4391 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4392 regs[rd] = 0;
4393 break;
4394 }
4395
4396 case DIF_SUBR_PROGENYOF: {
4397 pid_t pid = tupregs[0].dttk_value;
4398 struct proc *p = current_proc();
4399 int rval = 0, lim = nprocs;
4400
4401 while (p && (lim-- > 0)) {
4402 pid_t ppid;
4403
4404 ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
4405 if (*flags & CPU_DTRACE_FAULT)
4406 break;
4407
4408 if (ppid == pid) {
4409 rval = 1;
4410 break;
4411 }
4412
4413 if (ppid == 0)
4414 break; /* Can't climb process tree any further. */
4415
4416 p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
4417 if (*flags & CPU_DTRACE_FAULT)
4418 break;
4419 }
4420
4421 regs[rd] = rval;
4422 break;
4423 }
4424
4425 case DIF_SUBR_SPECULATION:
4426 regs[rd] = dtrace_speculation(state);
4427 break;
4428
4429
4430 case DIF_SUBR_COPYOUT: {
4431 uintptr_t kaddr = tupregs[0].dttk_value;
4432 user_addr_t uaddr = tupregs[1].dttk_value;
4433 uint64_t size = tupregs[2].dttk_value;
4434
4435 if (!dtrace_destructive_disallow &&
4436 dtrace_priv_proc_control(state) &&
4437 !dtrace_istoxic(kaddr, size) &&
4438 dtrace_canload(kaddr, size, mstate, vstate)) {
4439 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4440 dtrace_copyout(kaddr, uaddr, size, flags);
4441 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4442 }
4443 break;
4444 }
4445
4446 case DIF_SUBR_COPYOUTSTR: {
4447 uintptr_t kaddr = tupregs[0].dttk_value;
4448 user_addr_t uaddr = tupregs[1].dttk_value;
4449 uint64_t size = tupregs[2].dttk_value;
4450 size_t lim;
4451
4452 if (!dtrace_destructive_disallow &&
4453 dtrace_priv_proc_control(state) &&
4454 !dtrace_istoxic(kaddr, size) &&
4455 dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
4456 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4457 dtrace_copyoutstr(kaddr, uaddr, lim, flags);
4458 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4459 }
4460 break;
4461 }
4462
4463 case DIF_SUBR_STRLEN: {
4464 size_t size = state->dts_options[DTRACEOPT_STRSIZE];
4465 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4466 size_t lim;
4467
4468 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4469 regs[rd] = 0;
4470 break;
4471 }
4472
4473 regs[rd] = dtrace_strlen((char *)addr, lim);
4474
4475 break;
4476 }
4477
4478 case DIF_SUBR_STRCHR:
4479 case DIF_SUBR_STRRCHR: {
4480 /*
4481 * We're going to iterate over the string looking for the
4482 * specified character. We will iterate until we have reached
4483 * the string length or we have found the character. If this
4484 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4485 * of the specified character instead of the first.
4486 */
4487 uintptr_t addr = tupregs[0].dttk_value;
4488 uintptr_t addr_limit;
4489 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4490 size_t lim;
4491 char c, target = (char)tupregs[1].dttk_value;
4492
4493 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4494 regs[rd] = 0;
4495 break;
4496 }
4497 addr_limit = addr + lim;
4498
4499 for (regs[rd] = 0; addr < addr_limit; addr++) {
4500 if ((c = dtrace_load8(addr)) == target) {
4501 regs[rd] = addr;
4502
4503 if (subr == DIF_SUBR_STRCHR)
4504 break;
4505 }
4506
4507 if (c == '\0')
4508 break;
4509 }
4510
4511 break;
4512 }
4513
4514 case DIF_SUBR_STRSTR:
4515 case DIF_SUBR_INDEX:
4516 case DIF_SUBR_RINDEX: {
4517 /*
4518 * We're going to iterate over the string looking for the
4519 * specified string. We will iterate until we have reached
4520 * the string length or we have found the string. (Yes, this
4521 * is done in the most naive way possible -- but considering
4522 * that the string we're searching for is likely to be
4523 * relatively short, the complexity of Rabin-Karp or similar
4524 * hardly seems merited.)
4525 */
4526 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4527 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4528 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4529 size_t len = dtrace_strlen(addr, size);
4530 size_t sublen = dtrace_strlen(substr, size);
4531 char *limit = addr + len, *orig = addr;
4532 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4533 int inc = 1;
4534
4535 regs[rd] = notfound;
4536
4537 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4538 regs[rd] = 0;
4539 break;
4540 }
4541
4542 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4543 vstate)) {
4544 regs[rd] = 0;
4545 break;
4546 }
4547
4548 /*
4549 * strstr() and index()/rindex() have similar semantics if
4550 * both strings are the empty string: strstr() returns a
4551 * pointer to the (empty) string, and index() and rindex()
4552 * both return index 0 (regardless of any position argument).
4553 */
4554 if (sublen == 0 && len == 0) {
4555 if (subr == DIF_SUBR_STRSTR)
4556 regs[rd] = (uintptr_t)addr;
4557 else
4558 regs[rd] = 0;
4559 break;
4560 }
4561
4562 if (subr != DIF_SUBR_STRSTR) {
4563 if (subr == DIF_SUBR_RINDEX) {
4564 limit = orig - 1;
4565 addr += len;
4566 inc = -1;
4567 }
4568
4569 /*
4570 * Both index() and rindex() take an optional position
4571 * argument that denotes the starting position.
4572 */
4573 if (nargs == 3) {
4574 int64_t pos = (int64_t)tupregs[2].dttk_value;
4575
4576 /*
4577 * If the position argument to index() is
4578 * negative, Perl implicitly clamps it at
4579 * zero. This semantic is a little surprising
4580 * given the special meaning of negative
4581 * positions to similar Perl functions like
4582 * substr(), but it appears to reflect a
4583 * notion that index() can start from a
4584 * negative index and increment its way up to
4585 * the string. Given this notion, Perl's
4586 * rindex() is at least self-consistent in
4587 * that it implicitly clamps positions greater
4588 * than the string length to be the string
4589 * length. Where Perl completely loses
4590 * coherence, however, is when the specified
4591 * substring is the empty string (""). In
4592 * this case, even if the position is
4593 * negative, rindex() returns 0 -- and even if
4594 * the position is greater than the length,
4595 * index() returns the string length. These
4596 * semantics violate the notion that index()
4597 * should never return a value less than the
4598 * specified position and that rindex() should
4599 * never return a value greater than the
4600 * specified position. (One assumes that
4601 * these semantics are artifacts of Perl's
4602 * implementation and not the results of
4603 * deliberate design -- it beggars belief that
4604 * even Larry Wall could desire such oddness.)
4605 * While in the abstract one would wish for
4606 * consistent position semantics across
4607 * substr(), index() and rindex() -- or at the
4608 * very least self-consistent position
4609 * semantics for index() and rindex() -- we
4610 * instead opt to keep with the extant Perl
4611 * semantics, in all their broken glory. (Do
4612 * we have more desire to maintain Perl's
4613 * semantics than Perl does? Probably.)
4614 */
4615 if (subr == DIF_SUBR_RINDEX) {
4616 if (pos < 0) {
4617 if (sublen == 0)
4618 regs[rd] = 0;
4619 break;
4620 }
4621
4622 if ((size_t)pos > len)
4623 pos = len;
4624 } else {
4625 if (pos < 0)
4626 pos = 0;
4627
4628 if ((size_t)pos >= len) {
4629 if (sublen == 0)
4630 regs[rd] = len;
4631 break;
4632 }
4633 }
4634
4635 addr = orig + pos;
4636 }
4637 }
4638
4639 for (regs[rd] = notfound; addr != limit; addr += inc) {
4640 if (dtrace_strncmp(addr, substr, sublen) == 0) {
4641 if (subr != DIF_SUBR_STRSTR) {
4642 /*
4643 * As D index() and rindex() are
4644 * modeled on Perl (and not on awk),
4645 * we return a zero-based (and not a
4646 * one-based) index. (For you Perl
4647 * weenies: no, we're not going to add
4648 * $[ -- and shouldn't you be at a con
4649 * or something?)
4650 */
4651 regs[rd] = (uintptr_t)(addr - orig);
4652 break;
4653 }
4654
4655 ASSERT(subr == DIF_SUBR_STRSTR);
4656 regs[rd] = (uintptr_t)addr;
4657 break;
4658 }
4659 }
4660
4661 break;
4662 }
4663
4664 case DIF_SUBR_STRTOK: {
4665 uintptr_t addr = tupregs[0].dttk_value;
4666 uintptr_t tokaddr = tupregs[1].dttk_value;
4667 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4668 uintptr_t limit, toklimit;
4669 size_t clim;
4670 char *dest = (char *)mstate->dtms_scratch_ptr;
4671 uint8_t c = '\0', tokmap[32]; /* 256 / 8 */
4672 uint64_t i = 0;
4673
4674 /*
4675 * Check both the token buffer and (later) the input buffer,
4676 * since both could be non-scratch addresses.
4677 */
4678 if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
4679 regs[rd] = 0;
4680 break;
4681 }
4682 toklimit = tokaddr + clim;
4683
4684 if (!DTRACE_INSCRATCH(mstate, size)) {
4685 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4686 regs[rd] = 0;
4687 break;
4688 }
4689
4690 if (addr == 0) {
4691 /*
4692 * If the address specified is NULL, we use our saved
4693 * strtok pointer from the mstate. Note that this
4694 * means that the saved strtok pointer is _only_
4695 * valid within multiple enablings of the same probe --
4696 * it behaves like an implicit clause-local variable.
4697 */
4698 addr = mstate->dtms_strtok;
4699 limit = mstate->dtms_strtok_limit;
4700 } else {
4701 /*
4702 * If the user-specified address is non-NULL we must
4703 * access check it. This is the only time we have
4704 * a chance to do so, since this address may reside
4705 * in the string table of this clause -- future calls
4706 * (when we fetch addr from mstate->dtms_strtok)
4707 * would fail this access check.
4708 */
4709 if (!dtrace_strcanload(addr, size, &clim, mstate,
4710 vstate)) {
4711 regs[rd] = 0;
4712 break;
4713 }
4714 limit = addr + clim;
4715 }
4716
4717 /*
4718 * First, zero the token map, and then process the token
4719 * string -- setting a bit in the map for every character
4720 * found in the token string.
4721 */
4722 for (i = 0; i < (int)sizeof (tokmap); i++)
4723 tokmap[i] = 0;
4724
4725 for (; tokaddr < toklimit; tokaddr++) {
4726 if ((c = dtrace_load8(tokaddr)) == '\0')
4727 break;
4728
4729 ASSERT((c >> 3) < sizeof (tokmap));
4730 tokmap[c >> 3] |= (1 << (c & 0x7));
4731 }
4732
4733 for (; addr < limit; addr++) {
4734 /*
4735 * We're looking for a character that is _not_
4736 * contained in the token string.
4737 */
4738 if ((c = dtrace_load8(addr)) == '\0')
4739 break;
4740
4741 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4742 break;
4743 }
4744
4745 if (c == '\0') {
4746 /*
4747 * We reached the end of the string without finding
4748 * any character that was not in the token string.
4749 * We return NULL in this case, and we set the saved
4750 * address to NULL as well.
4751 */
4752 regs[rd] = 0;
4753 mstate->dtms_strtok = 0;
4754 mstate->dtms_strtok_limit = 0;
4755 break;
4756 }
4757
4758 /*
4759 * From here on, we're copying into the destination string.
4760 */
4761 for (i = 0; addr < limit && i < size - 1; addr++) {
4762 if ((c = dtrace_load8(addr)) == '\0')
4763 break;
4764
4765 if (tokmap[c >> 3] & (1 << (c & 0x7)))
4766 break;
4767
4768 ASSERT(i < size);
4769 dest[i++] = c;
4770 }
4771
4772 ASSERT(i < size);
4773 dest[i] = '\0';
4774 regs[rd] = (uintptr_t)dest;
4775 mstate->dtms_scratch_ptr += size;
4776 mstate->dtms_strtok = addr;
4777 mstate->dtms_strtok_limit = limit;
4778 break;
4779 }
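
	/*
	 * Editorial sketch of the token bitmap above: one bit per possible
	 * byte value, so each membership test costs a shift, a mask and a
	 * load:
	 *
	 *	uint8_t map[32] = { 0 };		(256 bits)
	 *	map[':' >> 3] |= (1 << (':' & 0x7));	(add ':' to the set)
	 *	if (map[c >> 3] & (1 << (c & 0x7)))
	 *		--> c is a delimiter
	 */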
4780
4781 case DIF_SUBR_SUBSTR: {
4782 uintptr_t s = tupregs[0].dttk_value;
4783 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4784 char *d = (char *)mstate->dtms_scratch_ptr;
4785 int64_t index = (int64_t)tupregs[1].dttk_value;
4786 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4787 size_t len = dtrace_strlen((char *)s, size);
4788 int64_t i = 0;
4789
4790 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4791 regs[rd] = 0;
4792 break;
4793 }
4794
4795 if (!DTRACE_INSCRATCH(mstate, size)) {
4796 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4797 regs[rd] = 0;
4798 break;
4799 }
4800
4801 if (nargs <= 2)
4802 remaining = (int64_t)size;
4803
4804 if (index < 0) {
4805 index += len;
4806
4807 if (index < 0 && index + remaining > 0) {
4808 remaining += index;
4809 index = 0;
4810 }
4811 }
4812
4813 if ((size_t)index >= len || index < 0) {
4814 remaining = 0;
4815 } else if (remaining < 0) {
4816 remaining += len - index;
4817 } else if ((uint64_t)index + (uint64_t)remaining > size) {
4818 remaining = size - index;
4819 }
4820
4821 for (i = 0; i < remaining; i++) {
4822 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4823 break;
4824 }
4825
4826 d[i] = '\0';
4827
4828 mstate->dtms_scratch_ptr += size;
4829 regs[rd] = (uintptr_t)d;
4830 break;
4831 }
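
	/*
	 * Editorial worked examples of the index/remaining clamping above:
	 *
	 *	substr("darwin", 2)	--> "rwin"  (remaining defaults to size)
	 *	substr("darwin", -3, 2)	--> "wi"    (index += len, giving 3)
	 *	substr("darwin", 9)	--> ""      (index >= len, remaining = 0)
	 *	substr("darwin", 1, -2)	--> "arw"   (remaining += len - index)
	 */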
4832
4833 case DIF_SUBR_GETMAJOR:
4834 regs[rd] = (uintptr_t)major((dev_t)tupregs[0].dttk_value);
4835 break;
4836
4837 case DIF_SUBR_GETMINOR:
4838 regs[rd] = (uintptr_t)minor((dev_t)tupregs[0].dttk_value);
4839 break;
4840
4841 case DIF_SUBR_DDI_PATHNAME: {
4842 /* APPLE NOTE: currently unsupported on Darwin */
4843 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4844 regs[rd] = 0;
4845 break;
4846 }
4847
4848 case DIF_SUBR_STRJOIN: {
4849 char *d = (char *)mstate->dtms_scratch_ptr;
4850 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4851 uintptr_t s1 = tupregs[0].dttk_value;
4852 uintptr_t s2 = tupregs[1].dttk_value;
4853 uint64_t i = 0, j = 0;
4854 size_t lim1, lim2;
4855 char c;
4856
4857 if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
4858 !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
4859 regs[rd] = 0;
4860 break;
4861 }
4862
4863 if (!DTRACE_INSCRATCH(mstate, size)) {
4864 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4865 regs[rd] = 0;
4866 break;
4867 }
4868
4869 for (;;) {
4870 if (i >= size) {
4871 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4872 regs[rd] = 0;
4873 break;
4874 }
4875 c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
4876 if ((d[i++] = c) == '\0') {
4877 i--;
4878 break;
4879 }
4880 }
4881
4882 for (;;) {
4883 if (i >= size) {
4884 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4885 regs[rd] = 0;
4886 break;
4887 }
4888 c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
4889 if ((d[i++] = c) == '\0')
4890 break;
4891 }
4892
4893 if (i < size) {
4894 mstate->dtms_scratch_ptr += i;
4895 regs[rd] = (uintptr_t)d;
4896 }
4897
4898 break;
4899 }
4900
4901 case DIF_SUBR_STRTOLL: {
4902 uintptr_t s = tupregs[0].dttk_value;
4903 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4904 size_t lim;
4905 int base = 10;
4906
4907 if (nargs > 1) {
4908 if ((base = tupregs[1].dttk_value) <= 1 ||
4909 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4910 *flags |= CPU_DTRACE_ILLOP;
4911 break;
4912 }
4913 }
4914
4915 if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {
4916 regs[rd] = INT64_MIN;
4917 break;
4918 }
4919
4920 regs[rd] = dtrace_strtoll((char *)s, base, lim);
4921 break;
4922 }
4923
4924 case DIF_SUBR_LLTOSTR: {
4925 int64_t i = (int64_t)tupregs[0].dttk_value;
4926 uint64_t val, digit;
4927 uint64_t size = 65; /* enough room for 2^64 in binary */
4928 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4929 int base = 10;
4930
4931 if (nargs > 1) {
4932 if ((base = tupregs[1].dttk_value) <= 1 ||
4933 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4934 *flags |= CPU_DTRACE_ILLOP;
4935 break;
4936 }
4937 }
4938
4939 val = (base == 10 && i < 0) ? i * -1 : i;
4940
4941 if (!DTRACE_INSCRATCH(mstate, size)) {
4942 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4943 regs[rd] = 0;
4944 break;
4945 }
4946
4947 for (*end-- = '\0'; val; val /= base) {
4948 if ((digit = val % base) <= '9' - '0') {
4949 *end-- = '0' + digit;
4950 } else {
4951 *end-- = 'a' + (digit - ('9' - '0') - 1);
4952 }
4953 }
4954
4955 if (i == 0 && base == 16)
4956 *end-- = '0';
4957
4958 if (base == 16)
4959 *end-- = 'x';
4960
4961 if (i == 0 || base == 8 || base == 16)
4962 *end-- = '0';
4963
4964 if (i < 0 && base == 10)
4965 *end-- = '-';
4966
4967 regs[rd] = (uintptr_t)end + 1;
4968 mstate->dtms_scratch_ptr += size;
4969 break;
4970 }
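
	/*
	 * Editorial sketch of the reverse digit generation above (zero and
	 * sign handling omitted; see the case itself): digits are written
	 * from the end of the scratch region backwards, so the result is
	 * simply end + 1:
	 *
	 *	char buf[65], *e = buf + sizeof (buf) - 1;
	 *	for (*e-- = '\0'; val != 0; val /= 10)
	 *		*e-- = '0' + (val % 10);
	 *	e + 1 --> "3501" for val == 3501
	 */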
4971
4972 case DIF_SUBR_HTONS:
4973 case DIF_SUBR_NTOHS:
4974 #ifdef _BIG_ENDIAN
4975 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4976 #else
4977 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4978 #endif
4979 break;
4980
4981
4982 case DIF_SUBR_HTONL:
4983 case DIF_SUBR_NTOHL:
4984 #ifdef _BIG_ENDIAN
4985 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4986 #else
4987 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4988 #endif
4989 break;
4990
4991
4992 case DIF_SUBR_HTONLL:
4993 case DIF_SUBR_NTOHLL:
4994 #ifdef _BIG_ENDIAN
4995 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4996 #else
4997 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4998 #endif
4999 break;
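
	/*
	 * Editorial note: on little-endian Darwin the htonX()/ntohX()
	 * family reduces to a full byte swap; for 16 bits, assuming
	 * DT_BSWAP_16 is the conventional swap macro, that is:
	 *
	 *	uint16_t n = (uint16_t)((h << 8) | (h >> 8));
	 *	0x1234 --> 0x3412
	 */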
5000
5001
5002 case DIF_SUBR_DIRNAME:
5003 case DIF_SUBR_BASENAME: {
5004 char *dest = (char *)mstate->dtms_scratch_ptr;
5005 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5006 uintptr_t src = tupregs[0].dttk_value;
5007 int i, j, len = dtrace_strlen((char *)src, size);
5008 int lastbase = -1, firstbase = -1, lastdir = -1;
5009 int start, end;
5010
5011 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5012 regs[rd] = 0;
5013 break;
5014 }
5015
5016 if (!DTRACE_INSCRATCH(mstate, size)) {
5017 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5018 regs[rd] = 0;
5019 break;
5020 }
5021
5022 /*
5023 * The basename and dirname for a zero-length string is
5024 * defined to be "."
5025 */
5026 if (len == 0) {
5027 len = 1;
5028 src = (uintptr_t)".";
5029 }
5030
5031 /*
5032 * Start from the back of the string, moving back toward the
5033 * front until we see a character that isn't a slash. That
5034 * character is the last character in the basename.
5035 */
5036 for (i = len - 1; i >= 0; i--) {
5037 if (dtrace_load8(src + i) != '/')
5038 break;
5039 }
5040
5041 if (i >= 0)
5042 lastbase = i;
5043
5044 /*
5045 * Starting from the last character in the basename, move
5046 * towards the front until we find a slash. The character
5047 * that we processed immediately before that is the first
5048 * character in the basename.
5049 */
5050 for (; i >= 0; i--) {
5051 if (dtrace_load8(src + i) == '/')
5052 break;
5053 }
5054
5055 if (i >= 0)
5056 firstbase = i + 1;
5057
5058 /*
5059 * Now keep going until we find a non-slash character. That
5060 * character is the last character in the dirname.
5061 */
5062 for (; i >= 0; i--) {
5063 if (dtrace_load8(src + i) != '/')
5064 break;
5065 }
5066
5067 if (i >= 0)
5068 lastdir = i;
5069
5070 ASSERT(!(lastbase == -1 && firstbase != -1));
5071 ASSERT(!(firstbase == -1 && lastdir != -1));
5072
5073 if (lastbase == -1) {
5074 /*
5075 * We didn't find a non-slash character. We know that
5076 * the length is non-zero, so the whole string must be
5077 * slashes. In either the dirname or the basename
5078 * case, we return '/'.
5079 */
5080 ASSERT(firstbase == -1);
5081 firstbase = lastbase = lastdir = 0;
5082 }
5083
5084 if (firstbase == -1) {
5085 /*
5086 * The entire string consists only of a basename
5087 * component. If we're looking for dirname, we need
5088 * to change our string to be just "."; if we're
5089 * looking for a basename, we'll just set the first
5090 * character of the basename to be 0.
5091 */
5092 if (subr == DIF_SUBR_DIRNAME) {
5093 ASSERT(lastdir == -1);
5094 src = (uintptr_t)".";
5095 lastdir = 0;
5096 } else {
5097 firstbase = 0;
5098 }
5099 }
5100
5101 if (subr == DIF_SUBR_DIRNAME) {
5102 if (lastdir == -1) {
5103 /*
5104 * We know that we have a slash in the name --
5105 * or lastdir would be set to 0, above. And
5106 * because lastdir is -1, we know that this
5107 * slash must be the first character. (That
5108 * is, the full string must be of the form
5109 * "/basename".) In this case, the last
5110 * character of the directory name is 0.
5111 */
5112 lastdir = 0;
5113 }
5114
5115 start = 0;
5116 end = lastdir;
5117 } else {
5118 ASSERT(subr == DIF_SUBR_BASENAME);
5119 ASSERT(firstbase != -1 && lastbase != -1);
5120 start = firstbase;
5121 end = lastbase;
5122 }
5123
5124 for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
5125 dest[j] = dtrace_load8(src + i);
5126
5127 dest[j] = '\0';
5128 regs[rd] = (uintptr_t)dest;
5129 mstate->dtms_scratch_ptr += size;
5130 break;
5131 }
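
	/*
	 * Editorial worked example of the three scans above: for
	 * "/foo/bar//", lastbase indexes the final 'r', firstbase the 'b'
	 * and lastdir the second 'o', giving basename "bar" and dirname
	 * "/foo".  For "bar" alone, firstbase is forced to 0 and dirname
	 * degenerates to ".".
	 */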
5132
5133 case DIF_SUBR_CLEANPATH: {
5134 char *dest = (char *)mstate->dtms_scratch_ptr, c;
5135 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5136 uintptr_t src = tupregs[0].dttk_value;
5137 size_t lim;
5138 size_t i = 0, j = 0;
5139
5140 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5141 regs[rd] = 0;
5142 break;
5143 }
5144
5145 if (!DTRACE_INSCRATCH(mstate, size)) {
5146 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5147 regs[rd] = 0;
5148 break;
5149 }
5150
5151 /*
5152 * Move forward, loading each character.
5153 */
5154 do {
5155 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5156 next:
5157 if ((uint64_t)(j + 5) >= size) /* 5 = strlen("/..c") + 1 for NUL */
5158 break;
5159
5160 if (c != '/') {
5161 dest[j++] = c;
5162 continue;
5163 }
5164
5165 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5166
5167 if (c == '/') {
5168 /*
5169 * We have two slashes -- we can just advance
5170 * to the next character.
5171 */
5172 goto next;
5173 }
5174
5175 if (c != '.') {
5176 /*
5177 * This is not "." and it's not ".." -- we can
5178 * just store the "/" and this character and
5179 * drive on.
5180 */
5181 dest[j++] = '/';
5182 dest[j++] = c;
5183 continue;
5184 }
5185
5186 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5187
5188 if (c == '/') {
5189 /*
5190 * This is a "/./" component. We're not going
5191 * to store anything in the destination buffer;
5192 * we're just going to go to the next component.
5193 */
5194 goto next;
5195 }
5196
5197 if (c != '.') {
5198 /*
5199 * This is not ".." -- we can just store the
5200 * "/." and this character and continue
5201 * processing.
5202 */
5203 dest[j++] = '/';
5204 dest[j++] = '.';
5205 dest[j++] = c;
5206 continue;
5207 }
5208
5209 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5210
5211 if (c != '/' && c != '\0') {
5212 /*
5213 * This is not ".." -- it's "..[mumble]".
5214 * We'll store the "/.." and this character
5215 * and continue processing.
5216 */
5217 dest[j++] = '/';
5218 dest[j++] = '.';
5219 dest[j++] = '.';
5220 dest[j++] = c;
5221 continue;
5222 }
5223
5224 /*
5225 * This is "/../" or "/..\0". We need to back up
5226 * our destination pointer until we find a "/".
5227 */
5228 i--;
5229 while (j != 0 && dest[--j] != '/')
5230 continue;
5231
5232 if (c == '\0')
5233 dest[++j] = '/';
5234 } while (c != '\0');
5235
5236 dest[j] = '\0';
5237 regs[rd] = (uintptr_t)dest;
5238 mstate->dtms_scratch_ptr += size;
5239 break;
5240 }
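
	/*
	 * Editorial worked example of the pass above: "/foo//./bar/../baz/"
	 * cleans to "/foo/baz/" -- doubled slashes and "." components are
	 * dropped as they are read, and each ".." backs dest up to the
	 * previous "/".
	 */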
5241
5242 case DIF_SUBR_INET_NTOA:
5243 case DIF_SUBR_INET_NTOA6:
5244 case DIF_SUBR_INET_NTOP: {
5245 size_t size;
5246 int af, argi, i;
5247 char *base, *end;
5248
5249 if (subr == DIF_SUBR_INET_NTOP) {
5250 af = (int)tupregs[0].dttk_value;
5251 argi = 1;
5252 } else {
5253 af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5254 argi = 0;
5255 }
5256
5257 if (af == AF_INET) {
5258 #if !defined(__APPLE__)
5259 ipaddr_t ip4;
5260 #else
5261 uint32_t ip4;
5262 #endif /* __APPLE__ */
5263 uint8_t *ptr8, val;
5264
5265 /*
5266 * Safely load the IPv4 address.
5267 */
5268 #if !defined(__APPLE__)
5269 ip4 = dtrace_load32(tupregs[argi].dttk_value);
5270 #else
5271 if (!dtrace_canload(tupregs[argi].dttk_value, sizeof(ip4),
5272 mstate, vstate)) {
5273 regs[rd] = 0;
5274 break;
5275 }
5276
5277 dtrace_bcopy(
5278 (void *)(uintptr_t)tupregs[argi].dttk_value,
5279 (void *)(uintptr_t)&ip4, sizeof (ip4));
5280 #endif /* __APPLE__ */
5281 /*
5282 * Check an IPv4 string will fit in scratch.
5283 */
5284 #if !defined(__APPLE__)
5285 size = INET_ADDRSTRLEN;
5286 #else
5287 size = MAX_IPv4_STR_LEN;
5288 #endif /* __APPLE__ */
5289 if (!DTRACE_INSCRATCH(mstate, size)) {
5290 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5291 regs[rd] = 0;
5292 break;
5293 }
5294 base = (char *)mstate->dtms_scratch_ptr;
5295 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5296
5297 /*
5298 * Stringify as a dotted decimal quad.
5299 */
5300 *end-- = '\0';
5301 ptr8 = (uint8_t *)&ip4;
5302 for (i = 3; i >= 0; i--) {
5303 val = ptr8[i];
5304
5305 if (val == 0) {
5306 *end-- = '0';
5307 } else {
5308 for (; val; val /= 10) {
5309 *end-- = '0' + (val % 10);
5310 }
5311 }
5312
5313 if (i > 0)
5314 *end-- = '.';
5315 }
5316 ASSERT(end + 1 >= base);
5317
5318 } else if (af == AF_INET6) {
5319 #if defined(__APPLE__)
5320 #define _S6_un __u6_addr
5321 #define _S6_u8 __u6_addr8
5322 #endif /* __APPLE__ */
5323 struct in6_addr ip6;
5324 int firstzero, tryzero, numzero, v6end;
5325 uint16_t val;
5326 const char digits[] = "0123456789abcdef";
5327
5328 /*
5329 * Stringify using RFC 1884 convention 2 - 16 bit
5330 * hexadecimal values with a zero-run compression.
5331 * Lower case hexadecimal digits are used.
5332 * eg, fe80::214:4fff:fe0b:76c8.
5333 * The IPv4 embedded form is returned for inet_ntop,
5334 * just the IPv4 string is returned for inet_ntoa6.
5335 */
5336
5337 if (!dtrace_canload(tupregs[argi].dttk_value,
5338 sizeof(struct in6_addr), mstate, vstate)) {
5339 regs[rd] = 0;
5340 break;
5341 }
5342
5343 /*
5344 * Safely load the IPv6 address.
5345 */
5346 dtrace_bcopy(
5347 (void *)(uintptr_t)tupregs[argi].dttk_value,
5348 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5349
5350 /*
5351 * Check an IPv6 string will fit in scratch.
5352 */
5353 size = INET6_ADDRSTRLEN;
5354 if (!DTRACE_INSCRATCH(mstate, size)) {
5355 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5356 regs[rd] = 0;
5357 break;
5358 }
5359 base = (char *)mstate->dtms_scratch_ptr;
5360 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5361 *end-- = '\0';
5362
5363 /*
5364 * Find the longest run of 16 bit zero values
5365 * for the single allowed zero compression - "::".
5366 */
5367 firstzero = -1;
5368 tryzero = -1;
5369 numzero = 1;
5370 for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
5371 if (ip6._S6_un._S6_u8[i] == 0 &&
5372 tryzero == -1 && i % 2 == 0) {
5373 tryzero = i;
5374 continue;
5375 }
5376
5377 if (tryzero != -1 &&
5378 (ip6._S6_un._S6_u8[i] != 0 ||
5379 i == sizeof (struct in6_addr) - 1)) {
5380
5381 if (i - tryzero <= numzero) {
5382 tryzero = -1;
5383 continue;
5384 }
5385
5386 firstzero = tryzero;
5387 numzero = i - i % 2 - tryzero;
5388 tryzero = -1;
5389
5390 if (ip6._S6_un._S6_u8[i] == 0 &&
5391 i == sizeof (struct in6_addr) - 1)
5392 numzero += 2;
5393 }
5394 }
5395 ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
5396
5397 /*
5398 * Check for an IPv4 embedded address.
5399 */
5400 v6end = sizeof (struct in6_addr) - 2;
5401 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5402 IN6_IS_ADDR_V4COMPAT(&ip6)) {
5403 for (i = sizeof (struct in6_addr) - 1;
5404 i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
5405 ASSERT(end >= base);
5406
5407 val = ip6._S6_un._S6_u8[i];
5408
5409 if (val == 0) {
5410 *end-- = '0';
5411 } else {
5412 for (; val; val /= 10) {
5413 *end-- = '0' + val % 10;
5414 }
5415 }
5416
5417 if (i > (int)DTRACE_V4MAPPED_OFFSET)
5418 *end-- = '.';
5419 }
5420
5421 if (subr == DIF_SUBR_INET_NTOA6)
5422 goto inetout;
5423
5424 /*
5425 * Set v6end to skip the IPv4 address that
5426 * we have already stringified.
5427 */
5428 v6end = 10;
5429 }
5430
5431 /*
5432 * Build the IPv6 string by working through the
5433 * address in reverse.
5434 */
5435 for (i = v6end; i >= 0; i -= 2) {
5436 ASSERT(end >= base);
5437
5438 if (i == firstzero + numzero - 2) {
5439 *end-- = ':';
5440 *end-- = ':';
5441 i -= numzero - 2;
5442 continue;
5443 }
5444
5445 if (i < 14 && i != firstzero - 2)
5446 *end-- = ':';
5447
5448 val = (ip6._S6_un._S6_u8[i] << 8) +
5449 ip6._S6_un._S6_u8[i + 1];
5450
5451 if (val == 0) {
5452 *end-- = '0';
5453 } else {
5454 for (; val; val /= 16) {
5455 *end-- = digits[val % 16];
5456 }
5457 }
5458 }
5459 ASSERT(end + 1 >= base);
5460
5461 #if defined(__APPLE__)
5462 #undef _S6_un
5463 #undef _S6_u8
5464 #endif /* __APPLE__ */
5465 } else {
5466 /*
5467 * The user didn't use AF_INET or AF_INET6.
5468 */
5469 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5470 regs[rd] = 0;
5471 break;
5472 }
5473
5474 inetout: regs[rd] = (uintptr_t)end + 1;
5475 mstate->dtms_scratch_ptr += size;
5476 break;
5477 }
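
	/*
	 * Editorial sketch of the zero-run selection above: scanning the
	 * eight 16-bit groups while tracking the longest run of zeros picks
	 * the run that "::" compresses, e.g. for
	 * fe80:0:0:0:214:4fff:fe0b:76c8 it picks groups 1..3:
	 *
	 *	int best = -1, bestlen = 0, run = 0;
	 *	for (k = 0; k < 8; k++) {
	 *		run = (g[k] == 0) ? run + 1 : 0;
	 *		if (run > bestlen) {
	 *			bestlen = run;
	 *			best = k - run + 1;
	 *		}
	 *	}
	 *	best == 1, bestlen == 3 --> fe80::214:4fff:fe0b:76c8
	 */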
5478
5479 case DIF_SUBR_JSON: {
5480 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5481 uintptr_t json = tupregs[0].dttk_value;
5482 size_t jsonlen = dtrace_strlen((char *)json, size);
5483 uintptr_t elem = tupregs[1].dttk_value;
5484 size_t elemlen = dtrace_strlen((char *)elem, size);
5485
5486 char *dest = (char *)mstate->dtms_scratch_ptr;
5487 char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
5488 char *ee = elemlist;
5489 int nelems = 1;
5490 uintptr_t cur;
5491
5492 if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
5493 !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
5494 regs[rd] = 0;
5495 break;
5496 }
5497
5498 if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
5499 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5500 regs[rd] = 0;
5501 break;
5502 }
5503
5504 /*
5505 * Read the element selector and split it up into a packed list
5506 * of strings.
5507 */
5508 for (cur = elem; cur < elem + elemlen; cur++) {
5509 char cc = dtrace_load8(cur);
5510
5511 if (cur == elem && cc == '[') {
5512 /*
5513 * If the first element selector key is
5514 * actually an array index then ignore the
5515 * bracket.
5516 */
5517 continue;
5518 }
5519
5520 if (cc == ']')
5521 continue;
5522
5523 if (cc == '.' || cc == '[') {
5524 nelems++;
5525 cc = '\0';
5526 }
5527
5528 *ee++ = cc;
5529 }
5530 *ee++ = '\0';
5531
5532 if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
5533 nelems, dest)) != 0)
5534 mstate->dtms_scratch_ptr += jsonlen + 1;
5535 break;
5536 }
5537
5538 case DIF_SUBR_TOUPPER:
5539 case DIF_SUBR_TOLOWER: {
5540 uintptr_t src = tupregs[0].dttk_value;
5541 char *dest = (char *)mstate->dtms_scratch_ptr;
5542 char lower, upper, base, c;
5543 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5544 size_t len = dtrace_strlen((char*) src, size);
5545 size_t i = 0;
5546
5547 lower = (subr == DIF_SUBR_TOUPPER) ? 'a' : 'A';
5548 upper = (subr == DIF_SUBR_TOUPPER) ? 'z' : 'Z';
5549 base = (subr == DIF_SUBR_TOUPPER) ? 'A' : 'a';
5550
5551 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5552 regs[rd] = 0;
5553 break;
5554 }
5555
5556 if (!DTRACE_INSCRATCH(mstate, size)) {
5557 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5558 regs[rd] = 0;
5559 break;
5560 }
5561
5562 for (i = 0; i < size - 1; ++i) {
5563 if ((c = dtrace_load8(src + i)) == '\0')
5564 break;
5565 if (c >= lower && c <= upper)
5566 c = base + (c - lower);
5567 dest[i] = c;
5568 }
5569
5570 ASSERT(i < size);
5571
5572 dest[i] = '\0';
5573 regs[rd] = (uintptr_t) dest;
5574 mstate->dtms_scratch_ptr += size;
5575
5576 break;
5577 }
5578 case DIF_SUBR_STRIP:
5579 if (!dtrace_is_valid_ptrauth_key(tupregs[1].dttk_value)) {
5580 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5581 break;
5582 }
5583 regs[rd] = (uint64_t)dtrace_ptrauth_strip(
5584 (void*)tupregs[0].dttk_value, tupregs[1].dttk_value);
5585 break;
5586
5587 #if defined(__APPLE__)
5588 case DIF_SUBR_VM_KERNEL_ADDRPERM: {
5589 if (!dtrace_priv_kernel(state)) {
5590 regs[rd] = 0;
5591 } else {
5592 regs[rd] = VM_KERNEL_ADDRPERM((vm_offset_t) tupregs[0].dttk_value);
5593 }
5594
5595 break;
5596 }
5597
5598 case DIF_SUBR_KDEBUG_TRACE: {
5599 uint32_t debugid;
5600 uintptr_t args[4] = {0};
5601 int i;
5602
5603 if (nargs < 2 || nargs > 5) {
5604 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5605 break;
5606 }
5607
5608 if (dtrace_destructive_disallow)
5609 return;
5610
5611 debugid = tupregs[0].dttk_value;
5612 for (i = 0; i < nargs - 1; i++)
5613 args[i] = tupregs[i + 1].dttk_value;
5614
5615 kernel_debug(debugid, args[0], args[1], args[2], args[3], 0);
5616
5617 break;
5618 }
5619
5620 case DIF_SUBR_KDEBUG_TRACE_STRING: {
5621 if (nargs != 3) {
5622 break;
5623 }
5624
5625 if (dtrace_destructive_disallow)
5626 return;
5627
5628 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5629 uint32_t debugid = tupregs[0].dttk_value;
5630 uint64_t str_id = tupregs[1].dttk_value;
5631 uintptr_t src = tupregs[2].dttk_value;
5632 size_t lim;
5633 char buf[size];
5634 char* str = NULL;
5635
5636 if (src != (uintptr_t)0) {
5637 str = buf;
5638 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5639 break;
5640 }
5641 dtrace_strcpy((void*)src, buf, size);
5642 }
5643
5644 (void)kernel_debug_string(debugid, &str_id, str);
5645 regs[rd] = str_id;
5646
5647 break;
5648 }
5649 #endif
5650
5651 }
5652 }
5653
5654 /*
5655 * Emulate the execution of DTrace IR instructions specified by the given
5656 * DIF object. This function is deliberately void of assertions as all of
5657 * the necessary checks are handled by a call to dtrace_difo_validate().
5658 */
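
/*
 * For reference (editorial; <sys/dtrace.h> is authoritative for the
 * encoding): each 32-bit dif_instr_t packs an 8-bit opcode and three
 * 8-bit register fields, decoded by the DIF_INSTR_* accessors used
 * below:
 *
 *	op = (instr >> 24) & 0xff;	DIF_INSTR_OP(instr)
 *	r1 = (instr >> 16) & 0xff;	DIF_INSTR_R1(instr)
 *	r2 = (instr >>  8) & 0xff;	DIF_INSTR_R2(instr)
 *	rd =  instr        & 0xff;	DIF_INSTR_RD(instr)
 */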
5659 static uint64_t
5660 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5661 dtrace_vstate_t *vstate, dtrace_state_t *state)
5662 {
5663 const dif_instr_t *text = difo->dtdo_buf;
5664 const uint_t textlen = difo->dtdo_len;
5665 const char *strtab = difo->dtdo_strtab;
5666 const uint64_t *inttab = difo->dtdo_inttab;
5667
5668 uint64_t rval = 0;
5669 dtrace_statvar_t *svar;
5670 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5671 dtrace_difv_t *v;
5672 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5673 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5674
5675 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5676 uint64_t regs[DIF_DIR_NREGS];
5677 uint64_t *tmp;
5678
5679 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5680 int64_t cc_r;
5681 uint_t pc = 0, id, opc = 0;
5682 uint8_t ttop = 0;
5683 dif_instr_t instr;
5684 uint_t r1, r2, rd;
5685
5686 /*
5687 * We stash the current DIF object into the machine state: we need it
5688 * for subsequent access checking.
5689 */
5690 mstate->dtms_difo = difo;
5691
5692 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
5693
5694 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5695 opc = pc;
5696
5697 instr = text[pc++];
5698 r1 = DIF_INSTR_R1(instr);
5699 r2 = DIF_INSTR_R2(instr);
5700 rd = DIF_INSTR_RD(instr);
5701
5702 switch (DIF_INSTR_OP(instr)) {
5703 case DIF_OP_OR:
5704 regs[rd] = regs[r1] | regs[r2];
5705 break;
5706 case DIF_OP_XOR:
5707 regs[rd] = regs[r1] ^ regs[r2];
5708 break;
5709 case DIF_OP_AND:
5710 regs[rd] = regs[r1] & regs[r2];
5711 break;
5712 case DIF_OP_SLL:
5713 regs[rd] = regs[r1] << regs[r2];
5714 break;
5715 case DIF_OP_SRL:
5716 regs[rd] = regs[r1] >> regs[r2];
5717 break;
5718 case DIF_OP_SUB:
5719 regs[rd] = regs[r1] - regs[r2];
5720 break;
5721 case DIF_OP_ADD:
5722 regs[rd] = regs[r1] + regs[r2];
5723 break;
5724 case DIF_OP_MUL:
5725 regs[rd] = regs[r1] * regs[r2];
5726 break;
5727 case DIF_OP_SDIV:
5728 if (regs[r2] == 0) {
5729 regs[rd] = 0;
5730 *flags |= CPU_DTRACE_DIVZERO;
5731 } else {
5732 regs[rd] = (int64_t)regs[r1] /
5733 (int64_t)regs[r2];
5734 }
5735 break;
5736
5737 case DIF_OP_UDIV:
5738 if (regs[r2] == 0) {
5739 regs[rd] = 0;
5740 *flags |= CPU_DTRACE_DIVZERO;
5741 } else {
5742 regs[rd] = regs[r1] / regs[r2];
5743 }
5744 break;
5745
5746 case DIF_OP_SREM:
5747 if (regs[r2] == 0) {
5748 regs[rd] = 0;
5749 *flags |= CPU_DTRACE_DIVZERO;
5750 } else {
5751 regs[rd] = (int64_t)regs[r1] %
5752 (int64_t)regs[r2];
5753 }
5754 break;
5755
5756 case DIF_OP_UREM:
5757 if (regs[r2] == 0) {
5758 regs[rd] = 0;
5759 *flags |= CPU_DTRACE_DIVZERO;
5760 } else {
5761 regs[rd] = regs[r1] % regs[r2];
5762 }
5763 break;
5764
5765 case DIF_OP_NOT:
5766 regs[rd] = ~regs[r1];
5767 break;
5768 case DIF_OP_MOV:
5769 regs[rd] = regs[r1];
5770 break;
5771 case DIF_OP_CMP:
5772 cc_r = regs[r1] - regs[r2];
5773 cc_n = cc_r < 0;
5774 cc_z = cc_r == 0;
5775 cc_v = 0;
5776 cc_c = regs[r1] < regs[r2];
5777 break;
5778 case DIF_OP_TST:
5779 cc_n = cc_v = cc_c = 0;
5780 cc_z = regs[r1] == 0;
5781 break;
5782 case DIF_OP_BA:
5783 pc = DIF_INSTR_LABEL(instr);
5784 break;
5785 case DIF_OP_BE:
5786 if (cc_z)
5787 pc = DIF_INSTR_LABEL(instr);
5788 break;
5789 case DIF_OP_BNE:
5790 if (cc_z == 0)
5791 pc = DIF_INSTR_LABEL(instr);
5792 break;
5793 case DIF_OP_BG:
5794 if ((cc_z | (cc_n ^ cc_v)) == 0)
5795 pc = DIF_INSTR_LABEL(instr);
5796 break;
5797 case DIF_OP_BGU:
5798 if ((cc_c | cc_z) == 0)
5799 pc = DIF_INSTR_LABEL(instr);
5800 break;
5801 case DIF_OP_BGE:
5802 if ((cc_n ^ cc_v) == 0)
5803 pc = DIF_INSTR_LABEL(instr);
5804 break;
5805 case DIF_OP_BGEU:
5806 if (cc_c == 0)
5807 pc = DIF_INSTR_LABEL(instr);
5808 break;
5809 case DIF_OP_BL:
5810 if (cc_n ^ cc_v)
5811 pc = DIF_INSTR_LABEL(instr);
5812 break;
5813 case DIF_OP_BLU:
5814 if (cc_c)
5815 pc = DIF_INSTR_LABEL(instr);
5816 break;
5817 case DIF_OP_BLE:
5818 if (cc_z | (cc_n ^ cc_v))
5819 pc = DIF_INSTR_LABEL(instr);
5820 break;
5821 case DIF_OP_BLEU:
5822 if (cc_c | cc_z)
5823 pc = DIF_INSTR_LABEL(instr);
5824 break;
5825 case DIF_OP_RLDSB:
5826 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5827 *flags |= CPU_DTRACE_KPRIV;
5828 *illval = regs[r1];
5829 break;
5830 }
5831 /*FALLTHROUGH*/
5832 case DIF_OP_LDSB:
5833 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5834 break;
5835 case DIF_OP_RLDSH:
5836 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5837 *flags |= CPU_DTRACE_KPRIV;
5838 *illval = regs[r1];
5839 break;
5840 }
5841 /*FALLTHROUGH*/
5842 case DIF_OP_LDSH:
5843 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5844 break;
5845 case DIF_OP_RLDSW:
5846 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5847 *flags |= CPU_DTRACE_KPRIV;
5848 *illval = regs[r1];
5849 break;
5850 }
5851 /*FALLTHROUGH*/
5852 case DIF_OP_LDSW:
5853 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5854 break;
5855 case DIF_OP_RLDUB:
5856 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5857 *flags |= CPU_DTRACE_KPRIV;
5858 *illval = regs[r1];
5859 break;
5860 }
5861 /*FALLTHROUGH*/
5862 case DIF_OP_LDUB:
5863 regs[rd] = dtrace_load8(regs[r1]);
5864 break;
5865 case DIF_OP_RLDUH:
5866 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5867 *flags |= CPU_DTRACE_KPRIV;
5868 *illval = regs[r1];
5869 break;
5870 }
5871 /*FALLTHROUGH*/
5872 case DIF_OP_LDUH:
5873 regs[rd] = dtrace_load16(regs[r1]);
5874 break;
5875 case DIF_OP_RLDUW:
5876 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5877 *flags |= CPU_DTRACE_KPRIV;
5878 *illval = regs[r1];
5879 break;
5880 }
5881 /*FALLTHROUGH*/
5882 case DIF_OP_LDUW:
5883 regs[rd] = dtrace_load32(regs[r1]);
5884 break;
5885 case DIF_OP_RLDX:
5886 if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
5887 *flags |= CPU_DTRACE_KPRIV;
5888 *illval = regs[r1];
5889 break;
5890 }
5891 /*FALLTHROUGH*/
5892 case DIF_OP_LDX:
5893 regs[rd] = dtrace_load64(regs[r1]);
5894 break;
5895 /*
5896 * Darwin: a 32-bit kernel may fetch from a 64-bit user process,
5897 * so do not cast regs to uintptr_t in the user-load opcodes:
5898 * DIF_OP_ULDSB, DIF_OP_ULDSH, DIF_OP_ULDSW, DIF_OP_ULDUB,
5899 * DIF_OP_ULDUH, DIF_OP_ULDUW, DIF_OP_ULDX.
5900 */
5901 case DIF_OP_ULDSB:
5902 regs[rd] = (int8_t)
5903 dtrace_fuword8(regs[r1]);
5904 break;
5905 case DIF_OP_ULDSH:
5906 regs[rd] = (int16_t)
5907 dtrace_fuword16(regs[r1]);
5908 break;
5909 case DIF_OP_ULDSW:
5910 regs[rd] = (int32_t)
5911 dtrace_fuword32(regs[r1]);
5912 break;
5913 case DIF_OP_ULDUB:
5914 regs[rd] =
5915 dtrace_fuword8(regs[r1]);
5916 break;
5917 case DIF_OP_ULDUH:
5918 regs[rd] =
5919 dtrace_fuword16(regs[r1]);
5920 break;
5921 case DIF_OP_ULDUW:
5922 regs[rd] =
5923 dtrace_fuword32(regs[r1]);
5924 break;
5925 case DIF_OP_ULDX:
5926 regs[rd] =
5927 dtrace_fuword64(regs[r1]);
5928 break;
5929 case DIF_OP_RET:
5930 rval = regs[rd];
5931 pc = textlen;
5932 break;
5933 case DIF_OP_NOP:
5934 break;
5935 case DIF_OP_SETX:
5936 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5937 break;
5938 case DIF_OP_SETS:
5939 regs[rd] = (uint64_t)(uintptr_t)
5940 (strtab + DIF_INSTR_STRING(instr));
5941 break;
5942 case DIF_OP_SCMP: {
5943 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5944 uintptr_t s1 = regs[r1];
5945 uintptr_t s2 = regs[r2];
5946 size_t lim1 = sz, lim2 = sz;
5947
5948 if (s1 != 0 &&
5949 !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
5950 break;
5951 if (s2 != 0 &&
5952 !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
5953 break;
5954
5955 cc_r = dtrace_strncmp((char *)s1, (char *)s2,
5956 MIN(lim1, lim2));
5957
5958 cc_n = cc_r < 0;
5959 cc_z = cc_r == 0;
5960 cc_v = cc_c = 0;
5961 break;
5962 }
5963 case DIF_OP_LDGA:
5964 regs[rd] = dtrace_dif_variable(mstate, state,
5965 r1, regs[r2]);
5966 break;
5967 case DIF_OP_LDGS:
5968 id = DIF_INSTR_VAR(instr);
5969
5970 if (id >= DIF_VAR_OTHER_UBASE) {
5971 uintptr_t a;
5972
5973 id -= DIF_VAR_OTHER_UBASE;
5974 svar = vstate->dtvs_globals[id];
5975 ASSERT(svar != NULL);
5976 v = &svar->dtsv_var;
5977
5978 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5979 regs[rd] = svar->dtsv_data;
5980 break;
5981 }
5982
5983 a = (uintptr_t)svar->dtsv_data;
5984
5985 if (*(uint8_t *)a == UINT8_MAX) {
5986 /*
5987 * If the 0th byte is set to UINT8_MAX
5988 * then this is to be treated as a
5989 * reference to a NULL variable.
5990 */
5991 regs[rd] = 0;
5992 } else {
5993 regs[rd] = a + sizeof (uint64_t);
5994 }
5995
5996 break;
5997 }
5998
5999 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6000 break;
6001
6002 case DIF_OP_STGS:
6003 id = DIF_INSTR_VAR(instr);
6004
6005 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6006 id -= DIF_VAR_OTHER_UBASE;
6007
6008 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6009 svar = vstate->dtvs_globals[id];
6010 ASSERT(svar != NULL);
6011 v = &svar->dtsv_var;
6012
6013 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6014 uintptr_t a = (uintptr_t)svar->dtsv_data;
6015 size_t lim;
6016
6017 ASSERT(a != 0);
6018 ASSERT(svar->dtsv_size != 0);
6019
6020 if (regs[rd] == 0) {
6021 *(uint8_t *)a = UINT8_MAX;
6022 break;
6023 } else {
6024 *(uint8_t *)a = 0;
6025 a += sizeof (uint64_t);
6026 }
6027 if (!dtrace_vcanload(
6028 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6029 &lim, mstate, vstate))
6030 break;
6031
6032 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6033 (void *)a, &v->dtdv_type, lim);
6034 break;
6035 }
6036
6037 svar->dtsv_data = regs[rd];
6038 break;
6039
6040 case DIF_OP_LDTA:
6041 /*
6042 * There are no DTrace built-in thread-local arrays at
6043 * present. This opcode is saved for future work.
6044 */
6045 *flags |= CPU_DTRACE_ILLOP;
6046 regs[rd] = 0;
6047 break;
6048
6049 case DIF_OP_LDLS:
6050 id = DIF_INSTR_VAR(instr);
6051
6052 if (id < DIF_VAR_OTHER_UBASE) {
6053 /*
6054 * For now, this has no meaning.
6055 */
6056 regs[rd] = 0;
6057 break;
6058 }
6059
6060 id -= DIF_VAR_OTHER_UBASE;
6061
6062 ASSERT(id < (uint_t)vstate->dtvs_nlocals);
6063 ASSERT(vstate->dtvs_locals != NULL);
6064 svar = vstate->dtvs_locals[id];
6065 ASSERT(svar != NULL);
6066 v = &svar->dtsv_var;
6067
6068 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6069 uintptr_t a = (uintptr_t)svar->dtsv_data;
6070 size_t sz = v->dtdv_type.dtdt_size;
6071
6072 sz += sizeof (uint64_t);
6073 ASSERT(svar->dtsv_size == (int)NCPU * sz);
6074 a += CPU->cpu_id * sz;
6075
6076 if (*(uint8_t *)a == UINT8_MAX) {
6077 /*
6078 * If the 0th byte is set to UINT8_MAX
6079 * then this is to be treated as a
6080 * reference to a NULL variable.
6081 */
6082 regs[rd] = 0;
6083 } else {
6084 regs[rd] = a + sizeof (uint64_t);
6085 }
6086
6087 break;
6088 }
6089
6090 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6091 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6092 regs[rd] = tmp[CPU->cpu_id];
6093 break;
6094
6095 case DIF_OP_STLS:
6096 id = DIF_INSTR_VAR(instr);
6097
6098 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6099 id -= DIF_VAR_OTHER_UBASE;
6100 VERIFY(id < (uint_t)vstate->dtvs_nlocals);
6101 ASSERT(vstate->dtvs_locals != NULL);
6102 svar = vstate->dtvs_locals[id];
6103 ASSERT(svar != NULL);
6104 v = &svar->dtsv_var;
6105
6106 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6107 uintptr_t a = (uintptr_t)svar->dtsv_data;
6108 size_t sz = v->dtdv_type.dtdt_size;
6109 size_t lim;
6110
6111 sz += sizeof (uint64_t);
6112 ASSERT(svar->dtsv_size == (int)NCPU * sz);
6113 a += CPU->cpu_id * sz;
6114
6115 if (regs[rd] == 0) {
6116 *(uint8_t *)a = UINT8_MAX;
6117 break;
6118 } else {
6119 *(uint8_t *)a = 0;
6120 a += sizeof (uint64_t);
6121 }
6122
6123 if (!dtrace_vcanload(
6124 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6125 &lim, mstate, vstate))
6126 break;
6127
6128 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6129 (void *)a, &v->dtdv_type, lim);
6130 break;
6131 }
6132
6133 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6134 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6135 tmp[CPU->cpu_id] = regs[rd];
6136 break;
6137
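/*
 * DIF_OP_LDTS/DIF_OP_STTS implement D thread-local variables
 * (e.g. "self->x" in a D script): the dynamic-variable key is the
 * (variable id, thread key) tuple built just below.
 */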
6138 case DIF_OP_LDTS: {
6139 dtrace_dynvar_t *dvar;
6140 dtrace_key_t *key;
6141
6142 id = DIF_INSTR_VAR(instr);
6143 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6144 id -= DIF_VAR_OTHER_UBASE;
6145 v = &vstate->dtvs_tlocals[id];
6146
6147 key = &tupregs[DIF_DTR_NREGS];
6148 key[0].dttk_value = (uint64_t)id;
6149 key[0].dttk_size = 0;
6150 DTRACE_TLS_THRKEY(key[1].dttk_value);
6151 key[1].dttk_size = 0;
6152
6153 dvar = dtrace_dynvar(dstate, 2, key,
6154 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6155 mstate, vstate);
6156
6157 if (dvar == NULL) {
6158 regs[rd] = 0;
6159 break;
6160 }
6161
6162 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6163 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6164 } else {
6165 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6166 }
6167
6168 break;
6169 }
6170
6171 case DIF_OP_STTS: {
6172 dtrace_dynvar_t *dvar;
6173 dtrace_key_t *key;
6174
6175 id = DIF_INSTR_VAR(instr);
6176 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6177 id -= DIF_VAR_OTHER_UBASE;
6178 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6179
6180 key = &tupregs[DIF_DTR_NREGS];
6181 key[0].dttk_value = (uint64_t)id;
6182 key[0].dttk_size = 0;
6183 DTRACE_TLS_THRKEY(key[1].dttk_value);
6184 key[1].dttk_size = 0;
6185 v = &vstate->dtvs_tlocals[id];
6186
6187 dvar = dtrace_dynvar(dstate, 2, key,
6188 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6189 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6190 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6191 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6192
6193 /*
6194 * Given that we're storing to thread-local data,
6195 * we need to flush our predicate cache.
6196 */
6197 dtrace_set_thread_predcache(current_thread(), 0);
6198
6199 if (dvar == NULL)
6200 break;
6201
6202 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6203 size_t lim;
6204
6205 if (!dtrace_vcanload(
6206 (void *)(uintptr_t)regs[rd],
6207 &v->dtdv_type, &lim, mstate, vstate))
6208 break;
6209
6210 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6211 dvar->dtdv_data, &v->dtdv_type, lim);
6212 } else {
6213 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6214 }
6215
6216 break;
6217 }
6218
6219 case DIF_OP_SRA:
6220 regs[rd] = (int64_t)regs[r1] >> regs[r2];
6221 break;
6222
6223 case DIF_OP_CALL:
6224 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6225 regs, tupregs, ttop, mstate, state);
6226 break;
6227
6228 case DIF_OP_PUSHTR:
6229 if (ttop == DIF_DTR_NREGS) {
6230 *flags |= CPU_DTRACE_TUPOFLOW;
6231 break;
6232 }
6233
6234 if (r1 == DIF_TYPE_STRING) {
6235 /*
6236 * If this is a string type and the size is 0,
6237 * we'll use the system-wide default string
6238 * size. Note that we are _not_ looking at
6239 * the value of the DTRACEOPT_STRSIZE option;
6240 * had this been set, we would expect to have
6241 * a non-zero size value in the "pushtr".
6242 */
6243 tupregs[ttop].dttk_size =
6244 dtrace_strlen((char *)(uintptr_t)regs[rd],
6245 regs[r2] ? regs[r2] :
6246 dtrace_strsize_default) + 1;
6247 } else {
6248 if (regs[r2] > LONG_MAX) {
6249 *flags |= CPU_DTRACE_ILLOP;
6250 break;
6251 }
6252 tupregs[ttop].dttk_size = regs[r2];
6253 }
6254
6255 tupregs[ttop++].dttk_value = regs[rd];
6256 break;
6257
6258 case DIF_OP_PUSHTV:
6259 if (ttop == DIF_DTR_NREGS) {
6260 *flags |= CPU_DTRACE_TUPOFLOW;
6261 break;
6262 }
6263
6264 tupregs[ttop].dttk_value = regs[rd];
6265 tupregs[ttop++].dttk_size = 0;
6266 break;
6267
6268 case DIF_OP_POPTS:
6269 if (ttop != 0)
6270 ttop--;
6271 break;
6272
6273 case DIF_OP_FLUSHTS:
6274 ttop = 0;
6275 break;
6276
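/*
 * DIF_OP_LDGAA/DIF_OP_LDTAA implement loads of global and thread-local
 * associative arrays (e.g. "a[execname]" and "self->a[execname]" in D);
 * the tuple registers pushed by pushtr/pushtv above form the key.
 */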
6277 case DIF_OP_LDGAA:
6278 case DIF_OP_LDTAA: {
6279 dtrace_dynvar_t *dvar;
6280 dtrace_key_t *key = tupregs;
6281 uint_t nkeys = ttop;
6282
6283 id = DIF_INSTR_VAR(instr);
6284 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6285 id -= DIF_VAR_OTHER_UBASE;
6286
6287 key[nkeys].dttk_value = (uint64_t)id;
6288 key[nkeys++].dttk_size = 0;
6289
6290 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6291 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6292 key[nkeys++].dttk_size = 0;
6293 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6294 v = &vstate->dtvs_tlocals[id];
6295 } else {
6296 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6297 v = &vstate->dtvs_globals[id]->dtsv_var;
6298 }
6299
6300 dvar = dtrace_dynvar(dstate, nkeys, key,
6301 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6302 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6303 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6304
6305 if (dvar == NULL) {
6306 regs[rd] = 0;
6307 break;
6308 }
6309
6310 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6311 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6312 } else {
6313 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6314 }
6315
6316 break;
6317 }
6318
6319 case DIF_OP_STGAA:
6320 case DIF_OP_STTAA: {
6321 dtrace_dynvar_t *dvar;
6322 dtrace_key_t *key = tupregs;
6323 uint_t nkeys = ttop;
6324
6325 id = DIF_INSTR_VAR(instr);
6326 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6327 id -= DIF_VAR_OTHER_UBASE;
6328
6329 key[nkeys].dttk_value = (uint64_t)id;
6330 key[nkeys++].dttk_size = 0;
6331
6332 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6333 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6334 key[nkeys++].dttk_size = 0;
6335 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6336 v = &vstate->dtvs_tlocals[id];
6337 } else {
6338 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6339 v = &vstate->dtvs_globals[id]->dtsv_var;
6340 }
6341
6342 dvar = dtrace_dynvar(dstate, nkeys, key,
6343 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6344 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6345 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6346 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6347
6348 if (dvar == NULL)
6349 break;
6350
6351 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6352 size_t lim;
6353
6354 if (!dtrace_vcanload(
6355 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6356 &lim, mstate, vstate))
6357 break;
6358
6359 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6360 dvar->dtdv_data, &v->dtdv_type, lim);
6361 } else {
6362 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6363 }
6364
6365 break;
6366 }
6367
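/*
 * DIF_OP_ALLOCS backs D's alloca() subroutine: it carves regs[r1]
 * bytes out of per-CPU scratch space, 8-byte aligned and zeroed.
 */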
6368 case DIF_OP_ALLOCS: {
6369 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6370 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6371
6372 /*
6373 * Rounding up the user allocation size could have
6374 * overflowed large, bogus allocations (like -1ULL) to
6375 * 0.
6376 */
6377 if (size < regs[r1] ||
6378 !DTRACE_INSCRATCH(mstate, size)) {
6379 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6380 regs[rd] = 0;
6381 break;
6382 }
6383
6384 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6385 mstate->dtms_scratch_ptr += size;
6386 regs[rd] = ptr;
6387 break;
6388 }
6389
6390 case DIF_OP_COPYS:
6391 if (!dtrace_canstore(regs[rd], regs[r2],
6392 mstate, vstate)) {
6393 *flags |= CPU_DTRACE_BADADDR;
6394 *illval = regs[rd];
6395 break;
6396 }
6397
6398 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6399 break;
6400
6401 dtrace_bcopy((void *)(uintptr_t)regs[r1],
6402 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6403 break;
6404
6405 case DIF_OP_STB:
6406 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6407 *flags |= CPU_DTRACE_BADADDR;
6408 *illval = regs[rd];
6409 break;
6410 }
6411 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6412 break;
6413
6414 case DIF_OP_STH:
6415 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6416 *flags |= CPU_DTRACE_BADADDR;
6417 *illval = regs[rd];
6418 break;
6419 }
6420 if (regs[rd] & 1) {
6421 *flags |= CPU_DTRACE_BADALIGN;
6422 *illval = regs[rd];
6423 break;
6424 }
6425 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6426 break;
6427
6428 case DIF_OP_STW:
6429 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6430 *flags |= CPU_DTRACE_BADADDR;
6431 *illval = regs[rd];
6432 break;
6433 }
6434 if (regs[rd] & 3) {
6435 *flags |= CPU_DTRACE_BADALIGN;
6436 *illval = regs[rd];
6437 break;
6438 }
6439 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6440 break;
6441
6442 case DIF_OP_STX:
6443 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6444 *flags |= CPU_DTRACE_BADADDR;
6445 *illval = regs[rd];
6446 break;
6447 }
6448
6449 /*
6450 * Darwin: kmem_zalloc() buffers allocated by dtrace_difo_init()
6451 * are only guaranteed 4-byte alignment, so check that instead of 8.
6452 */
6453 if (regs[rd] & 3) {
6454 *flags |= CPU_DTRACE_BADALIGN;
6455 *illval = regs[rd];
6456 break;
6457 }
6458 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6459 break;
6460 case DIF_OP_STRIP:
6461 regs[rd] = (uint64_t)dtrace_ptrauth_strip(
6462 (void*)regs[r1], r2);
6463 break;
6464 }
6465 }
6466
6467 if (!(*flags & CPU_DTRACE_FAULT))
6468 return (rval);
6469
6470 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6471 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6472
6473 return (0);
6474 }
6475
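/*
 * Implements the destructive breakpoint() D action, which drops into
 * the kernel debugger via debug_enter(). Illustrative use from a
 * consumer with destructive permission (the probe specification is a
 * placeholder):
 *
 *	dtrace -w -n 'fbt::somefunc:entry { breakpoint(); }'
 */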
6476 static void
6477 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6478 {
6479 dtrace_probe_t *probe = ecb->dte_probe;
6480 dtrace_provider_t *prov = probe->dtpr_provider;
6481 char c[DTRACE_FULLNAMELEN + 80], *str;
6482 const char *msg = "dtrace: breakpoint action at probe ";
6483 const char *ecbmsg = " (ecb ";
6484 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6485 uintptr_t val = (uintptr_t)ecb;
6486 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6487
6488 if (dtrace_destructive_disallow)
6489 return;
6490
6491 /*
6492 * It's impossible to be taking action on the NULL probe.
6493 */
6494 ASSERT(probe != NULL);
6495
6496 /*
6497 * This is a poor man's (destitute man's?) sprintf(): we want to
6498 * print the provider name, module name, function name and name of
6499 * the probe, along with the hex address of the ECB with the breakpoint
6500 * action -- all of which we must place in the character buffer by
6501 * hand.
6502 */
6503 while (*msg != '\0')
6504 c[i++] = *msg++;
6505
6506 for (str = prov->dtpv_name; *str != '\0'; str++)
6507 c[i++] = *str;
6508 c[i++] = ':';
6509
6510 for (str = probe->dtpr_mod; *str != '\0'; str++)
6511 c[i++] = *str;
6512 c[i++] = ':';
6513
6514 for (str = probe->dtpr_func; *str != '\0'; str++)
6515 c[i++] = *str;
6516 c[i++] = ':';
6517
6518 for (str = probe->dtpr_name; *str != '\0'; str++)
6519 c[i++] = *str;
6520
6521 while (*ecbmsg != '\0')
6522 c[i++] = *ecbmsg++;
6523
6524 while (shift >= 0) {
6525 mask = (uintptr_t)0xf << shift;
6526
6527 if (val >= ((uintptr_t)1 << shift))
6528 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6529 shift -= 4;
6530 }
6531
6532 c[i++] = ')';
6533 c[i] = '\0';
6534
6535 debug_enter(c);
6536 }
6537
6538 static void
6539 dtrace_action_panic(dtrace_ecb_t *ecb)
6540 {
6541 dtrace_probe_t *probe = ecb->dte_probe;
6542
6543 /*
6544 * It's impossible to be taking action on the NULL probe.
6545 */
6546 ASSERT(probe != NULL);
6547
6548 if (dtrace_destructive_disallow)
6549 return;
6550
6551 if (dtrace_panicked != NULL)
6552 return;
6553
6554 if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
6555 return;
6556
6557 /*
6558 * We won the right to panic. (We want to be sure that only one
6559 * thread calls panic() from dtrace_probe(), and that panic() is
6560 * called exactly once.)
6561 */
6562 panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6563 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6564 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6565
6566 /*
6567 * APPLE NOTE: this was for an old Mac OS X debug feature
6568 * allowing a return from panic(). Revisit someday.
6569 */
6570 dtrace_panicked = NULL;
6571 }
6572
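/*
 * Implements the destructive raise() D action, which sends the
 * specified signal to the current process when it next leaves the
 * kernel -- e.g. (illustrative):
 *
 *	syscall::write:entry /pid == $target/ { raise(SIGINT); }
 */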
6573 static void
6574 dtrace_action_raise(uint64_t sig)
6575 {
6576 if (dtrace_destructive_disallow)
6577 return;
6578
6579 if (sig >= NSIG) {
6580 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6581 return;
6582 }
6583
6584 /*
6585 * raise() has a queue depth of 1 -- we ignore all subsequent
6586 * invocations of the raise() action.
6587 */
6588
6589 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6590
6591 if (uthread && uthread->t_dtrace_sig == 0) {
6592 uthread->t_dtrace_sig = sig;
6593 act_set_astbsd(current_thread());
6594 }
6595 }
6596
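/*
 * Implements the destructive stop() D action: the current process is
 * marked for suspension when it next leaves the kernel -- e.g.
 * (illustrative) "proc:::exec-success { stop(); }".
 */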
6597 static void
6598 dtrace_action_stop(void)
6599 {
6600 if (dtrace_destructive_disallow)
6601 return;
6602
6603 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6604 if (uthread) {
6605 /*
6606 * The currently running process will be suspended via task_suspend()
6607 * when it next leaves the kernel.
6608 */
6609 uthread->t_dtrace_stop = 1;
6610 act_set_astbsd(current_thread());
6611 }
6612 }
6613
6614
6615 /*
6616 * APPLE NOTE: pidresume works in conjunction with the dtrace stop action.
6617 * Both activate only when the currently running process next leaves the
6618 * kernel.
6619 */
6620 static void
6621 dtrace_action_pidresume(uint64_t pid)
6622 {
6623 if (dtrace_destructive_disallow)
6624 return;
6625
6626 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
6627 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6628 return;
6629 }
6630 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6631
6632 /*
6633 * When the currently running process leaves the kernel, it attempts to
6634 * task_resume the process (denoted by pid), if that pid appears to have
6635 * been stopped by dtrace_action_stop().
6636 * The currently running process has a pidresume() queue depth of 1 --
6637 * subsequent invocations of the pidresume() action are ignored.
6638 */
6639
6640 if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) {
6641 uthread->t_dtrace_resumepid = pid;
6642 act_set_astbsd(current_thread());
6643 }
6644 }
6645
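/*
 * Implements the destructive chill() D action, which spins in probe
 * context for the given number of nanoseconds -- e.g. (illustrative)
 * "chill(500000)" to dwell for 500 microseconds -- bounded by the
 * dtrace_chill_interval and dtrace_chill_max tunables checked below.
 */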
6646 static void
6647 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6648 {
6649 hrtime_t now;
6650 volatile uint16_t *flags;
6651 dtrace_cpu_t *cpu = CPU;
6652
6653 if (dtrace_destructive_disallow)
6654 return;
6655
6656 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6657
6658 now = dtrace_gethrtime();
6659
6660 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6661 /*
6662 * We need to advance the mark to the current time.
6663 */
6664 cpu->cpu_dtrace_chillmark = now;
6665 cpu->cpu_dtrace_chilled = 0;
6666 }
6667
6668 /*
6669 * Now check to see if the requested chill time would take us over
6670 * the maximum amount of time allowed in the chill interval. (Or
6671 * worse, if the calculation itself induces overflow.)
6672 */
6673 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6674 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6675 *flags |= CPU_DTRACE_ILLOP;
6676 return;
6677 }
6678
6679 while (dtrace_gethrtime() - now < val)
6680 continue;
6681
6682 /*
6683 * Normally, we assure that the value of the variable "timestamp" does
6684 * not change within an ECB. The presence of chill() represents an
6685 * exception to this rule, however.
6686 */
6687 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6688 cpu->cpu_dtrace_chilled += val;
6689 }
6690
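/*
 * Slow path for the ustack()/jstack() D actions when string space has
 * been requested (e.g. "ustack(20, 512)" -- illustrative): the user
 * stack is walked and the per-process USTACK helper (if any) is
 * invoked to translate each frame into a symbol string.
 */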
6691 static void
6692 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6693 uint64_t *buf, uint64_t arg)
6694 {
6695 int nframes = DTRACE_USTACK_NFRAMES(arg);
6696 int strsize = DTRACE_USTACK_STRSIZE(arg);
6697 uint64_t *pcs = &buf[1], *fps;
6698 char *str = (char *)&pcs[nframes];
6699 int size, offs = 0, i, j;
6700 uintptr_t old = mstate->dtms_scratch_ptr, saved;
6701 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6702 char *sym;
6703
6704 /*
6705 * Should be taking a faster path if string space has not been
6706 * allocated.
6707 */
6708 ASSERT(strsize != 0);
6709
6710 /*
6711 * We will first allocate some temporary space for the frame pointers.
6712 */
6713 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6714 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6715 (nframes * sizeof (uint64_t));
6716
6717 if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
6718 /*
6719 * Not enough room for our frame pointers -- need to indicate
6720 * that we ran out of scratch space.
6721 */
6722 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6723 return;
6724 }
6725
6726 mstate->dtms_scratch_ptr += size;
6727 saved = mstate->dtms_scratch_ptr;
6728
6729 /*
6730 * Now get a stack with both program counters and frame pointers.
6731 */
6732 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6733 dtrace_getufpstack(buf, fps, nframes + 1);
6734 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6735
6736 /*
6737 * If that faulted, we're cooked.
6738 */
6739 if (*flags & CPU_DTRACE_FAULT)
6740 goto out;
6741
6742 /*
6743 * Now we want to walk up the stack, calling the USTACK helper. For
6744 * each iteration, we restore the scratch pointer.
6745 */
6746 for (i = 0; i < nframes; i++) {
6747 mstate->dtms_scratch_ptr = saved;
6748
6749 if (offs >= strsize)
6750 break;
6751
6752 sym = (char *)(uintptr_t)dtrace_helper(
6753 DTRACE_HELPER_ACTION_USTACK,
6754 mstate, state, pcs[i], fps[i]);
6755
6756 /*
6757 * If we faulted while running the helper, we're going to
6758 * clear the fault and null out the corresponding string.
6759 */
6760 if (*flags & CPU_DTRACE_FAULT) {
6761 *flags &= ~CPU_DTRACE_FAULT;
6762 str[offs++] = '\0';
6763 continue;
6764 }
6765
6766 if (sym == NULL) {
6767 str[offs++] = '\0';
6768 continue;
6769 }
6770
6771 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6772
6773 /*
6774 * Now copy in the string that the helper returned to us.
6775 */
6776 for (j = 0; offs + j < strsize; j++) {
6777 if ((str[offs + j] = sym[j]) == '\0')
6778 break;
6779 }
6780
6781 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6782
6783 offs += j + 1;
6784 }
6785
6786 if (offs >= strsize) {
6787 /*
6788 * If we didn't have room for all of the strings, we don't
6789 * abort processing -- this needn't be a fatal error -- but we
6790 * still want to increment a counter (dts_stkstroverflows) to
6791 * allow this condition to be warned about. (If this is from
6792 * a jstack() action, it is easily tuned via jstackstrsize.)
6793 */
6794 dtrace_error(&state->dts_stkstroverflows);
6795 }
6796
6797 while (offs < strsize)
6798 str[offs++] = '\0';
6799
6800 out:
6801 mstate->dtms_scratch_ptr = old;
6802 }
6803
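/*
 * Copies a by-reference record (a string or a tracemem()-style byte
 * blob, from kernel or user memory according to dtkind) into the
 * buffer at *valoffsp. Called from dtrace_probe() for actions whose
 * return type carries DIF_TF_BYREF or DIF_TF_BYUREF.
 */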
6804 static void
6805 dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6806 size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6807 {
6808 volatile uint16_t *flags;
6809 uint64_t val = *valp;
6810 size_t valoffs = *valoffsp;
6811
6812 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6813 ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6814
6815 /*
6816 * If this is a string, we're going to only load until we find the zero
6817 * byte -- after which we'll store zero bytes.
6818 */
6819 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6820 char c = '\0' + 1;
6821 size_t s;
6822
6823 for (s = 0; s < size; s++) {
6824 if (c != '\0' && dtkind == DIF_TF_BYREF) {
6825 c = dtrace_load8(val++);
6826 } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6827 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6828 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6829 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6830 if (*flags & CPU_DTRACE_FAULT)
6831 break;
6832 }
6833
6834 DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6835
6836 if (c == '\0' && intuple)
6837 break;
6838 }
6839 } else {
6840 uint8_t c;
6841 while (valoffs < end) {
6842 if (dtkind == DIF_TF_BYREF) {
6843 c = dtrace_load8(val++);
6844 } else if (dtkind == DIF_TF_BYUREF) {
6845 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6846 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6847 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6848 if (*flags & CPU_DTRACE_FAULT)
6849 break;
6850 }
6851
6852 DTRACE_STORE(uint8_t, tomax,
6853 valoffs++, c);
6854 }
6855 }
6856
6857 *valp = val;
6858 *valoffsp = valoffs;
6859 }
6860
6861 /*
6862 * Disables interrupts and sets the per-thread inprobe flag. When DEBUG is
6863 * defined, we also assert that we are not recursing unless the probe ID is an
6864 * error probe.
6865 */
6866 static dtrace_icookie_t
6867 dtrace_probe_enter(dtrace_id_t id)
6868 {
6869 thread_t thread = current_thread();
6870 uint16_t inprobe;
6871
6872 dtrace_icookie_t cookie;
6873
6874 cookie = dtrace_interrupt_disable();
6875
6876 /*
6877 * Unless this is an ERROR probe, we are not allowed to recurse in
6878 * dtrace_probe(). Recursing into a DTrace probe usually means that a
6879 * function is instrumented that should not have been instrumented or
6880 * that the ordering guarantee of the records will be violated,
6881 * resulting in unexpected output. If there is an exception to this
6882 * assertion, a new case should be added.
6883 */
6884 inprobe = dtrace_get_thread_inprobe(thread);
6885 VERIFY(inprobe == 0 ||
6886 id == dtrace_probeid_error);
6887 ASSERT(inprobe < UINT16_MAX);
6888 dtrace_set_thread_inprobe(thread, inprobe + 1);
6889
6890 return (cookie);
6891 }
6892
6893 /*
6894 * Clears the per-thread inprobe flag and enables interrupts.
6895 */
6896 static void
6897 dtrace_probe_exit(dtrace_icookie_t cookie)
6898 {
6899 thread_t thread = current_thread();
6900 uint16_t inprobe = dtrace_get_thread_inprobe(thread);
6901
6902 ASSERT(inprobe > 0);
6903 dtrace_set_thread_inprobe(thread, inprobe - 1);
6904
6905 #if INTERRUPT_MASKED_DEBUG
6906 ml_spin_debug_reset(thread);
6907 #endif /* INTERRUPT_MASKED_DEBUG */
6908
6909 dtrace_interrupt_enable(cookie);
6910 }
6911
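/*
 * A provider fires a probe by calling dtrace_probe() with the id it was
 * given by dtrace_probe_create() and up to five arguments. A minimal
 * sketch (hypothetical provider code):
 *
 *	if (my_probe_enabled)
 *		dtrace_probe(my_probe_id, (uint64_t)arg, 0, 0, 0, 0);
 */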
6912 /*
6913 * If you're looking for the epicenter of DTrace, you just found it. This
6914 * is the function called by the provider to fire a probe -- from which all
6915 * subsequent probe-context DTrace activity emanates.
6916 */
6917 void
6918 dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6919 uint64_t arg2, uint64_t arg3, uint64_t arg4)
6920 {
6921 processorid_t cpuid;
6922 dtrace_icookie_t cookie;
6923 dtrace_probe_t *probe;
6924 dtrace_mstate_t mstate;
6925 dtrace_ecb_t *ecb;
6926 dtrace_action_t *act;
6927 intptr_t offs;
6928 size_t size;
6929 int vtime, onintr;
6930 volatile uint16_t *flags;
6931 hrtime_t now;
6932
6933 cookie = dtrace_probe_enter(id);
6934 probe = dtrace_probes[id - 1];
6935 cpuid = CPU->cpu_id;
6936 onintr = CPU_ON_INTR(CPU);
6937
6938 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6939 probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
6940 /*
6941 * We have hit in the predicate cache; we know that
6942 * this predicate would evaluate to be false.
6943 */
6944 dtrace_probe_exit(cookie);
6945 return;
6946 }
6947
6948 if (panic_quiesce) {
6949 /*
6950 * We don't trace anything if we're panicking.
6951 */
6952 dtrace_probe_exit(cookie);
6953 return;
6954 }
6955
6956 #if !defined(__APPLE__)
6957 now = dtrace_gethrtime();
6958 vtime = dtrace_vtime_references != 0;
6959
6960 if (vtime && curthread->t_dtrace_start)
6961 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
6962 #else
6963 /*
6964 * APPLE NOTE: The time spent entering DTrace and arriving
6965 * at this point is attributed to the current thread.
6966 * Instead it should accrue to DTrace. FIXME
6967 */
6968 vtime = dtrace_vtime_references != 0;
6969
6970 if (vtime)
6971 {
6972 int64_t dtrace_accum_time, recent_vtime;
6973 thread_t thread = current_thread();
6974
6975 dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
6976
6977 if (dtrace_accum_time >= 0) {
6978 recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
6979
6980 recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
6981
6982 dtrace_set_thread_vtime(thread, recent_vtime);
6983 }
6984 }
6985
6986 now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
6987 #endif /* __APPLE__ */
6988
6989 /*
6990 * APPLE NOTE: A provider may call dtrace_probe_error() in lieu of
6991 * dtrace_probe() in some circumstances. See, e.g. fasttrap_isa.c.
6992 * However the provider has no access to ECB context, so passes
6993 * 0 through "arg0" and the probe_id of the overridden probe as arg1.
6994 * Detect that here and cons up a viable state (from the probe_id).
6995 */
6996 if (dtrace_probeid_error == id && 0 == arg0) {
6997 dtrace_id_t ftp_id = (dtrace_id_t)arg1;
6998 dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
6999 dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
7000
7001 if (NULL != ftp_ecb) {
7002 dtrace_state_t *ftp_state = ftp_ecb->dte_state;
7003
7004 arg0 = (uint64_t)(uintptr_t)ftp_state;
7005 arg1 = ftp_ecb->dte_epid;
7006 /*
7007 * args[2-4] established by caller.
7008 */
7009 ftp_state->dts_arg_error_illval = -1; /* arg5 */
7010 }
7011 }
7012
7013 mstate.dtms_difo = NULL;
7014 mstate.dtms_probe = probe;
7015 mstate.dtms_strtok = 0;
7016 mstate.dtms_arg[0] = arg0;
7017 mstate.dtms_arg[1] = arg1;
7018 mstate.dtms_arg[2] = arg2;
7019 mstate.dtms_arg[3] = arg3;
7020 mstate.dtms_arg[4] = arg4;
7021
7022 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7023
7024 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7025 dtrace_predicate_t *pred = ecb->dte_predicate;
7026 dtrace_state_t *state = ecb->dte_state;
7027 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7028 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7029 dtrace_vstate_t *vstate = &state->dts_vstate;
7030 dtrace_provider_t *prov = probe->dtpr_provider;
7031 uint64_t tracememsize = 0;
7032 int committed = 0;
7033 caddr_t tomax;
7034
7035 /*
7036 * A little subtlety with the following (seemingly innocuous)
7037 * declaration of the automatic 'val': by looking at the
7038 * code, you might think that it could be declared in the
7039 * action processing loop, below. (That is, it's only used in
7040 * the action processing loop.) However, it must be declared
7041 * out of that scope because in the case of DIF expression
7042 * arguments to aggregating actions, one iteration of the
7043 * action loop will use the last iteration's value.
7044 */
7045 uint64_t val = 0;
7050
7051 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7052 *flags &= ~CPU_DTRACE_ERROR;
7053
7054 if (prov == dtrace_provider) {
7055 /*
7056 * If dtrace itself is the provider of this probe,
7057 * we're only going to continue processing the ECB if
7058 * arg0 (the dtrace_state_t) is equal to the ECB's
7059 * creating state. (This prevents disjoint consumers
7060 * from seeing one another's metaprobes.)
7061 */
7062 if (arg0 != (uint64_t)(uintptr_t)state)
7063 continue;
7064 }
7065
7066 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7067 /*
7068 * We're not currently active. If our provider isn't
7069 * the dtrace pseudo provider, we're not interested.
7070 */
7071 if (prov != dtrace_provider)
7072 continue;
7073
7074 /*
7075 * Now we must further check if we are in the BEGIN
7076 * probe. If we are, we will only continue processing
7077 * if we're still in WARMUP -- if one BEGIN enabling
7078 * has invoked the exit() action, we don't want to
7079 * evaluate subsequent BEGIN enablings.
7080 */
7081 if (probe->dtpr_id == dtrace_probeid_begin &&
7082 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7083 ASSERT(state->dts_activity ==
7084 DTRACE_ACTIVITY_DRAINING);
7085 continue;
7086 }
7087 }
7088
7089 if (ecb->dte_cond) {
7090 /*
7091 * If the dte_cond bits indicate that this
7092 * consumer is only allowed to see user-mode firings
7093 * of this probe, call the provider's dtps_usermode()
7094 * entry point to check that the probe was fired
7095 * while in a user context. Skip this ECB if that's
7096 * not the case.
7097 */
7098 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7099 prov->dtpv_pops.dtps_usermode &&
7100 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7101 probe->dtpr_id, probe->dtpr_arg) == 0)
7102 continue;
7103
7104 /*
7105 * This is more subtle than it looks. We have to be
7106 * absolutely certain that CRED() isn't going to
7107 * change out from under us so it's only legit to
7108 * examine that structure if we're in constrained
7109 situations. Currently, the only time we'll do this
7110 check is if a non-super-user has enabled the
7111 * profile or syscall providers -- providers that
7112 * allow visibility of all processes. For the
7113 * profile case, the check above will ensure that
7114 * we're examining a user context.
7115 */
7116 if (ecb->dte_cond & DTRACE_COND_OWNER) {
7117 cred_t *cr;
7118 cred_t *s_cr =
7119 ecb->dte_state->dts_cred.dcr_cred;
7120 proc_t *proc;
7121 #pragma unused(proc) /* __APPLE__ */
7122
7123 ASSERT(s_cr != NULL);
7124
7125 /*
7126 * XXX this is hackish, but so is setting a variable
7127 * XXX in a McCarthy OR...
7128 */
7129 if ((cr = dtrace_CRED()) == NULL ||
7130 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
7131 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
7132 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
7133 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
7134 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
7135 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
7136 #if !defined(__APPLE__)
7137 (proc = ttoproc(curthread)) == NULL ||
7138 (proc->p_flag & SNOCD))
7139 #else
7140 1) /* APPLE NOTE: Darwin omits "No Core Dump" flag */
7141 #endif /* __APPLE__ */
7142 continue;
7143 }
7144
7145 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7146 cred_t *cr;
7147 cred_t *s_cr =
7148 ecb->dte_state->dts_cred.dcr_cred;
7149 #pragma unused(cr, s_cr) /* __APPLE__ */
7150
7151 ASSERT(s_cr != NULL);
7152
7153 #if !defined(__APPLE__)
7154 if ((cr = CRED()) == NULL ||
7155 s_cr->cr_zone->zone_id !=
7156 cr->cr_zone->zone_id)
7157 continue;
7158 #else
7159 /* APPLE NOTE: Darwin doesn't do zones. */
7160 #endif /* __APPLE__ */
7161 }
7162 }
7163
7164 if (now - state->dts_alive > dtrace_deadman_timeout) {
7165 /*
7166 * We seem to be dead. Unless we (a) have kernel
7167 * destructive permissions, (b) have explicitly enabled
7168 * destructive actions and (c) destructive actions have
7169 * not been disabled, we're going to transition into
7170 * the KILLED state, from which no further processing
7171 * on this state will be performed.
7172 */
7173 if (!dtrace_priv_kernel_destructive(state) ||
7174 !state->dts_cred.dcr_destructive ||
7175 dtrace_destructive_disallow) {
7176 void *activity = &state->dts_activity;
7177 dtrace_activity_t current;
7178
7179 do {
7180 current = state->dts_activity;
7181 } while (dtrace_cas32(activity, current,
7182 DTRACE_ACTIVITY_KILLED) != current);
7183
7184 continue;
7185 }
7186 }
7187
7188 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7189 ecb->dte_alignment, state, &mstate)) < 0)
7190 continue;
7191
7192 tomax = buf->dtb_tomax;
7193 ASSERT(tomax != NULL);
7194
7195 /*
7196 * Build and store the record header corresponding to the ECB.
7197 */
7198 if (ecb->dte_size != 0) {
7199 dtrace_rechdr_t dtrh;
7200
7201 if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7202 mstate.dtms_timestamp = dtrace_gethrtime();
7203 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7204 }
7205
7206 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7207
7208 dtrh.dtrh_epid = ecb->dte_epid;
7209 DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
7210 DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
7211 }
7212
7213 mstate.dtms_epid = ecb->dte_epid;
7214 mstate.dtms_present |= DTRACE_MSTATE_EPID;
7215
7216 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7217 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7218 else
7219 mstate.dtms_access = 0;
7220
7221 if (pred != NULL) {
7222 dtrace_difo_t *dp = pred->dtp_difo;
7223 uint64_t rval;
7224
7225 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7226
7227 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7228 dtrace_cacheid_t cid = probe->dtpr_predcache;
7229
7230 if (cid != DTRACE_CACHEIDNONE && !onintr) {
7231 /*
7232 * Update the predicate cache...
7233 */
7234 ASSERT(cid == pred->dtp_cacheid);
7235
7236 dtrace_set_thread_predcache(current_thread(), cid);
7237 }
7238
7239 continue;
7240 }
7241 }
7242
7243 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7244 act != NULL; act = act->dta_next) {
7245 size_t valoffs;
7246 dtrace_difo_t *dp;
7247 dtrace_recdesc_t *rec = &act->dta_rec;
7248
7249 size = rec->dtrd_size;
7250 valoffs = offs + rec->dtrd_offset;
7251
7252 if (DTRACEACT_ISAGG(act->dta_kind)) {
7253 uint64_t v = 0xbad;
7254 dtrace_aggregation_t *agg;
7255
7256 agg = (dtrace_aggregation_t *)act;
7257
7258 if ((dp = act->dta_difo) != NULL)
7259 v = dtrace_dif_emulate(dp,
7260 &mstate, vstate, state);
7261
7262 if (*flags & CPU_DTRACE_ERROR)
7263 continue;
7264
7265 /*
7266 * Note that we always pass the expression
7267 * value from the previous iteration of the
7268 * action loop. This value will only be used
7269 * if there is an expression argument to the
7270 * aggregating action, denoted by the
7271 * dtag_hasarg field.
7272 */
7273 dtrace_aggregate(agg, buf,
7274 offs, aggbuf, v, val);
7275 continue;
7276 }
7277
7278 switch (act->dta_kind) {
7279 case DTRACEACT_STOP:
7280 if (dtrace_priv_proc_destructive(state))
7281 dtrace_action_stop();
7282 continue;
7283
7284 case DTRACEACT_BREAKPOINT:
7285 if (dtrace_priv_kernel_destructive(state))
7286 dtrace_action_breakpoint(ecb);
7287 continue;
7288
7289 case DTRACEACT_PANIC:
7290 if (dtrace_priv_kernel_destructive(state))
7291 dtrace_action_panic(ecb);
7292 continue;
7293
7294 case DTRACEACT_STACK:
7295 if (!dtrace_priv_kernel(state))
7296 continue;
7297
7298 dtrace_getpcstack((pc_t *)(tomax + valoffs),
7299 size / sizeof (pc_t), probe->dtpr_aframes,
7300 DTRACE_ANCHORED(probe) ? NULL :
7301 (uint32_t *)(uintptr_t)arg0);
7302 continue;
7303
7304 case DTRACEACT_JSTACK:
7305 case DTRACEACT_USTACK:
7306 if (!dtrace_priv_proc(state))
7307 continue;
7308
7309 /*
7310 * See comment in DIF_VAR_PID.
7311 */
7312 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7313 CPU_ON_INTR(CPU)) {
7314 int depth = DTRACE_USTACK_NFRAMES(
7315 rec->dtrd_arg) + 1;
7316
7317 dtrace_bzero((void *)(tomax + valoffs),
7318 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7319 + depth * sizeof (uint64_t));
7320
7321 continue;
7322 }
7323
7324 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7325 curproc->p_dtrace_helpers != NULL) {
7326 /*
7327 * This is the slow path -- we have
7328 * allocated string space, and we're
7329 * getting the stack of a process that
7330 * has helpers. Call into a separate
7331 * routine to perform this processing.
7332 */
7333 dtrace_action_ustack(&mstate, state,
7334 (uint64_t *)(tomax + valoffs),
7335 rec->dtrd_arg);
7336 continue;
7337 }
7338
7339 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7340 dtrace_getupcstack((uint64_t *)
7341 (tomax + valoffs),
7342 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7343 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7344 continue;
7345
7346 default:
7347 break;
7348 }
7349
7350 dp = act->dta_difo;
7351 ASSERT(dp != NULL);
7352
7353 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7354
7355 if (*flags & CPU_DTRACE_ERROR)
7356 continue;
7357
7358 switch (act->dta_kind) {
7359 case DTRACEACT_SPECULATE: {
7360 dtrace_rechdr_t *dtrh = NULL;
7361
7362 ASSERT(buf == &state->dts_buffer[cpuid]);
7363 buf = dtrace_speculation_buffer(state,
7364 cpuid, val);
7365
7366 if (buf == NULL) {
7367 *flags |= CPU_DTRACE_DROP;
7368 continue;
7369 }
7370
7371 offs = dtrace_buffer_reserve(buf,
7372 ecb->dte_needed, ecb->dte_alignment,
7373 state, NULL);
7374
7375 if (offs < 0) {
7376 *flags |= CPU_DTRACE_DROP;
7377 continue;
7378 }
7379
7380 tomax = buf->dtb_tomax;
7381 ASSERT(tomax != NULL);
7382
7383 if (ecb->dte_size == 0)
7384 continue;
7385
7386 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7387 dtrh = ((void *)(tomax + offs));
7388 dtrh->dtrh_epid = ecb->dte_epid;
7389
7390 /*
7391 * When the speculation is committed, all of
7392 * the records in the speculative buffer will
7393 * have their timestamps set to the commit
7394 * time. Until then, it is set to a sentinel
7395 * value, for debuggability.
7396 */
7397 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7398
7399 continue;
7400 }
7401
7402 case DTRACEACT_CHILL:
7403 if (dtrace_priv_kernel_destructive(state))
7404 dtrace_action_chill(&mstate, val);
7405 continue;
7406
7407 case DTRACEACT_RAISE:
7408 if (dtrace_priv_proc_destructive(state))
7409 dtrace_action_raise(val);
7410 continue;
7411
7412 case DTRACEACT_PIDRESUME: /* __APPLE__ */
7413 if (dtrace_priv_proc_destructive(state))
7414 dtrace_action_pidresume(val);
7415 continue;
7416
7417 case DTRACEACT_COMMIT:
7418 ASSERT(!committed);
7419
7420 /*
7421 * We need to commit our buffer state.
7422 */
7423 if (ecb->dte_size)
7424 buf->dtb_offset = offs + ecb->dte_size;
7425 buf = &state->dts_buffer[cpuid];
7426 dtrace_speculation_commit(state, cpuid, val);
7427 committed = 1;
7428 continue;
7429
7430 case DTRACEACT_DISCARD:
7431 dtrace_speculation_discard(state, cpuid, val);
7432 continue;
7433
7434 case DTRACEACT_DIFEXPR:
7435 case DTRACEACT_LIBACT:
7436 case DTRACEACT_PRINTF:
7437 case DTRACEACT_PRINTA:
7438 case DTRACEACT_SYSTEM:
7439 case DTRACEACT_FREOPEN:
7440 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
7441 case DTRACEACT_TRACEMEM:
7442 break;
7443
7444 case DTRACEACT_TRACEMEM_DYNSIZE:
7445 tracememsize = val;
7446 break;
7447
7448 case DTRACEACT_SYM:
7449 case DTRACEACT_MOD:
7450 if (!dtrace_priv_kernel(state))
7451 continue;
7452 break;
7453
7454 case DTRACEACT_USYM:
7455 case DTRACEACT_UMOD:
7456 case DTRACEACT_UADDR: {
7457 if (!dtrace_priv_proc(state))
7458 continue;
7459
7460 DTRACE_STORE(uint64_t, tomax,
7461 valoffs, (uint64_t)dtrace_proc_selfpid());
7462 DTRACE_STORE(uint64_t, tomax,
7463 valoffs + sizeof (uint64_t), val);
7464
7465 continue;
7466 }
7467
7468 case DTRACEACT_EXIT: {
7469 /*
7470 * For the exit action, we are going to attempt
7471 * to atomically set our activity to be
7472 * draining. If this fails (either because
7473 * another CPU has beat us to the exit action,
7474 * or because our current activity is something
7475 * other than ACTIVE or WARMUP), we will
7476 * continue. This assures that the exit action
7477 * can be successfully recorded at most once
7478 * when we're in the ACTIVE state. If we're
7479 * encountering the exit() action while in
7480 * COOLDOWN, however, we want to honor the new
7481 * status code. (We know that we're the only
7482 * thread in COOLDOWN, so there is no race.)
7483 */
7484 void *activity = &state->dts_activity;
7485 dtrace_activity_t current = state->dts_activity;
7486
7487 if (current == DTRACE_ACTIVITY_COOLDOWN)
7488 break;
7489
7490 if (current != DTRACE_ACTIVITY_WARMUP)
7491 current = DTRACE_ACTIVITY_ACTIVE;
7492
7493 if (dtrace_cas32(activity, current,
7494 DTRACE_ACTIVITY_DRAINING) != current) {
7495 *flags |= CPU_DTRACE_DROP;
7496 continue;
7497 }
7498
7499 break;
7500 }
7501
7502 default:
7503 ASSERT(0);
7504 }
7505
7506 if (dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF)) {
7507 uintptr_t end = valoffs + size;
7508
7509 if (tracememsize != 0 &&
7510 valoffs + tracememsize < end)
7511 {
7512 end = valoffs + tracememsize;
7513 tracememsize = 0;
7514 }
7515
7516 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7517 !dtrace_vcanload((void *)(uintptr_t)val,
7518 &dp->dtdo_rtype, NULL, &mstate, vstate))
7519 {
7520 continue;
7521 }
7522
7523 dtrace_store_by_ref(dp, tomax, size, &valoffs,
7524 &val, end, act->dta_intuple,
7525 dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7526 DIF_TF_BYREF: DIF_TF_BYUREF);
7527
7528 continue;
7529 }
7530
7531 switch (size) {
7532 case 0:
7533 break;
7534
7535 case sizeof (uint8_t):
7536 DTRACE_STORE(uint8_t, tomax, valoffs, val);
7537 break;
7538 case sizeof (uint16_t):
7539 DTRACE_STORE(uint16_t, tomax, valoffs, val);
7540 break;
7541 case sizeof (uint32_t):
7542 DTRACE_STORE(uint32_t, tomax, valoffs, val);
7543 break;
7544 case sizeof (uint64_t):
7545 DTRACE_STORE(uint64_t, tomax, valoffs, val);
7546 break;
7547 default:
7548 /*
7549 * Any other size should have been returned by
7550 * reference, not by value.
7551 */
7552 ASSERT(0);
7553 break;
7554 }
7555 }
7556
7557 if (*flags & CPU_DTRACE_DROP)
7558 continue;
7559
7560 if (*flags & CPU_DTRACE_FAULT) {
7561 int ndx;
7562 dtrace_action_t *err;
7563
7564 buf->dtb_errors++;
7565
7566 if (probe->dtpr_id == dtrace_probeid_error) {
7567 /*
7568 * There's nothing we can do -- we had an
7569 * error on the error probe. We bump an
7570 * error counter to at least indicate that
7571 * this condition happened.
7572 */
7573 dtrace_error(&state->dts_dblerrors);
7574 continue;
7575 }
7576
7577 if (vtime) {
7578 /*
7579 * Before recursing on dtrace_probe(), we
7580 * need to explicitly clear out our start
7581 * time to prevent it from being accumulated
7582 * into t_dtrace_vtime.
7583 */
7584
7585 /*
7586 * Darwin sets the sign bit on t_dtrace_tracing
7587 * to suspend accumulation to it.
7588 */
7589 dtrace_set_thread_tracing(current_thread(),
7590 (1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
7591
7592 }
7593
7594 /*
7595 * Iterate over the actions to figure out which action
7596 * we were processing when we experienced the error.
7597 * Note that act points _past_ the faulting action; if
7598 * act is ecb->dte_action, the fault was in the
7599 * predicate, if it's ecb->dte_action->dta_next it's
7600 * in action #1, and so on.
7601 */
7602 for (err = ecb->dte_action, ndx = 0;
7603 err != act; err = err->dta_next, ndx++)
7604 continue;
7605
7606 dtrace_probe_error(state, ecb->dte_epid, ndx,
7607 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7608 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7609 cpu_core[cpuid].cpuc_dtrace_illval);
7610
7611 continue;
7612 }
7613
7614 if (!committed)
7615 buf->dtb_offset = offs + ecb->dte_size;
7616 }
7617
7618 /* FIXME: On Darwin the time spent leaving DTrace from this point to the rti
7619  * is attributed to the current thread. Instead it should accrue to DTrace. */
7620 if (vtime) {
7621 thread_t thread = current_thread();
7622 int64_t t = dtrace_get_thread_tracing(thread);
7623
7624 if (t >= 0) {
7625 /* Usual case, accumulate time spent here into t_dtrace_tracing */
7626 dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
7627 } else {
7628 /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
7629 dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
7630 }
7631 }
7632
7633 dtrace_probe_exit(cookie);
7634 }
7635
7636 /*
7637 * DTrace Probe Hashing Functions
7638 *
7639 * The functions in this section (and indeed, the functions in remaining
7640 * sections) are not _called_ from probe context. (Any exceptions to this are
7641 * marked with a "Note:".) Rather, they are called from elsewhere in the
7642 * DTrace framework to look-up probes in, add probes to and remove probes from
7643 * the DTrace probe hashes. (Each probe is hashed by each element of the
7644 * probe tuple -- allowing for fast lookups, regardless of what was
7645 * specified.)
7646 */
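/*
 * This is the classic PJW/ELF string hash; the tables below are always
 * sized to a power of two, so a hash value is mapped to a bucket by
 * masking with dth_mask rather than by a modulo.
 */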
7647 static uint_t
7648 dtrace_hash_str(const char *p)
7649 {
7650 unsigned int g;
7651 uint_t hval = 0;
7652
7653 while (*p) {
7654 hval = (hval << 4) + *p++;
7655 if ((g = (hval & 0xf0000000)) != 0)
7656 hval ^= g >> 24;
7657 hval &= ~g;
7658 }
7659 return (hval);
7660 }
7661
7662 static const char*
7663 dtrace_strkey_probe_provider(void *elm, uintptr_t offs)
7664 {
7665 #pragma unused(offs)
7666 dtrace_probe_t *probe = (dtrace_probe_t*)elm;
7667 return probe->dtpr_provider->dtpv_name;
7668 }
7669
7670 static const char*
7671 dtrace_strkey_offset(void *elm, uintptr_t offs)
7672 {
7673 return ((char *)((uintptr_t)(elm) + offs));
7674 }
7675
7676 static const char*
7677 dtrace_strkey_deref_offset(void *elm, uintptr_t offs)
7678 {
7679 return *((char **)((uintptr_t)(elm) + offs));
7680 }
7681
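/*
 * Creates a hash whose key is the string that 'func' returns for an
 * element. A sketch of a caller, patterned on the probe hashes created
 * elsewhere in this file (hashing a dtrace_probe_t by module name):
 *
 *	hash = dtrace_hash_create(dtrace_strkey_deref_offset,
 *	    offsetof(dtrace_probe_t, dtpr_mod),
 *	    offsetof(dtrace_probe_t, dtpr_nextmod),
 *	    offsetof(dtrace_probe_t, dtpr_prevmod));
 */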
7682 static dtrace_hash_t *
7683 dtrace_hash_create(dtrace_strkey_f func, uintptr_t arg, uintptr_t nextoffs, uintptr_t prevoffs)
7684 {
7685 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7686
7687 hash->dth_getstr = func;
7688 hash->dth_stroffs = arg;
7689 hash->dth_nextoffs = nextoffs;
7690 hash->dth_prevoffs = prevoffs;
7691
7692 hash->dth_size = 1;
7693 hash->dth_mask = hash->dth_size - 1;
7694
7695 hash->dth_tab = kmem_zalloc(hash->dth_size *
7696 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7697
7698 return (hash);
7699 }
7700
7701 /*
7702 * APPLE NOTE: dtrace_hash_destroy is not used.
7703 * It is called by dtrace_detach which is not
7704 * currently implemented. Revisit someday.
7705 */
7706 #if !defined(__APPLE__)
7707 static void
7708 dtrace_hash_destroy(dtrace_hash_t *hash)
7709 {
7710 #if DEBUG
7711 int i;
7712
7713 for (i = 0; i < hash->dth_size; i++)
7714 ASSERT(hash->dth_tab[i] == NULL);
7715 #endif
7716
7717 kmem_free(hash->dth_tab,
7718 hash->dth_size * sizeof (dtrace_hashbucket_t *));
7719 kmem_free(hash, sizeof (dtrace_hash_t));
7720 }
7721 #endif /* __APPLE__ */
7722
7723 static void
7724 dtrace_hash_resize(dtrace_hash_t *hash)
7725 {
7726 int size = hash->dth_size, i, ndx;
7727 int new_size = hash->dth_size << 1;
7728 int new_mask = new_size - 1;
7729 dtrace_hashbucket_t **new_tab, *bucket, *next;
7730
7731 ASSERT((new_size & new_mask) == 0);
7732
7733 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7734
7735 for (i = 0; i < size; i++) {
7736 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7737 void *elm = bucket->dthb_chain;
7738
7739 ASSERT(elm != NULL);
7740 ndx = DTRACE_HASHSTR(hash, elm) & new_mask;
7741
7742 next = bucket->dthb_next;
7743 bucket->dthb_next = new_tab[ndx];
7744 new_tab[ndx] = bucket;
7745 }
7746 }
7747
7748 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7749 hash->dth_tab = new_tab;
7750 hash->dth_size = new_size;
7751 hash->dth_mask = new_mask;
7752 }
7753
7754 static void
7755 dtrace_hash_add(dtrace_hash_t *hash, void *new)
7756 {
7757 int hashval = DTRACE_HASHSTR(hash, new);
7758 int ndx = hashval & hash->dth_mask;
7759 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7760 void **nextp, **prevp;
7761
7762 for (; bucket != NULL; bucket = bucket->dthb_next) {
7763 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7764 goto add;
7765 }
7766
7767 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7768 dtrace_hash_resize(hash);
7769 dtrace_hash_add(hash, new);
7770 return;
7771 }
7772
7773 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7774 bucket->dthb_next = hash->dth_tab[ndx];
7775 hash->dth_tab[ndx] = bucket;
7776 hash->dth_nbuckets++;
7777
7778 add:
7779 nextp = DTRACE_HASHNEXT(hash, new);
7780 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7781 *nextp = bucket->dthb_chain;
7782
7783 if (bucket->dthb_chain != NULL) {
7784 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7785 ASSERT(*prevp == NULL);
7786 *prevp = new;
7787 }
7788
7789 bucket->dthb_chain = new;
7790 bucket->dthb_len++;
7791 }
7792
7793 static void *
7794 dtrace_hash_lookup_string(dtrace_hash_t *hash, const char *str)
7795 {
7796 int hashval = dtrace_hash_str(str);
7797 int ndx = hashval & hash->dth_mask;
7798 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7799
7800 for (; bucket != NULL; bucket = bucket->dthb_next) {
7801 if (strcmp(str, DTRACE_GETSTR(hash, bucket->dthb_chain)) == 0)
7802 return (bucket->dthb_chain);
7803 }
7804
7805 return (NULL);
7806 }
7807
7808 static dtrace_probe_t *
7809 dtrace_hash_lookup(dtrace_hash_t *hash, void *template)
7810 {
7811 return dtrace_hash_lookup_string(hash, DTRACE_GETSTR(hash, template));
7812 }
7813
7814 static int
7815 dtrace_hash_collisions(dtrace_hash_t *hash, void *template)
7816 {
7817 int hashval = DTRACE_HASHSTR(hash, template);
7818 int ndx = hashval & hash->dth_mask;
7819 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7820
7821 for (; bucket != NULL; bucket = bucket->dthb_next) {
7822 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7823 return (bucket->dthb_len);
7824 }
7825
7826 return (0);
7827 }
7828
7829 static void
7830 dtrace_hash_remove(dtrace_hash_t *hash, void *elm)
7831 {
7832 int ndx = DTRACE_HASHSTR(hash, elm) & hash->dth_mask;
7833 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7834
7835 void **prevp = DTRACE_HASHPREV(hash, elm);
7836 void **nextp = DTRACE_HASHNEXT(hash, elm);
7837
7838 /*
7839 * Find the bucket that we're removing this elm from.
7840 */
7841 for (; bucket != NULL; bucket = bucket->dthb_next) {
7842 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, elm))
7843 break;
7844 }
7845
7846 ASSERT(bucket != NULL);
7847
7848 if (*prevp == NULL) {
7849 if (*nextp == NULL) {
7850 /*
7851 * The removed element was the only element on this
7852 * bucket; we need to remove the bucket.
7853 */
7854 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7855
7856 ASSERT(bucket->dthb_chain == elm);
7857 ASSERT(b != NULL);
7858
7859 if (b == bucket) {
7860 hash->dth_tab[ndx] = bucket->dthb_next;
7861 } else {
7862 while (b->dthb_next != bucket)
7863 b = b->dthb_next;
7864 b->dthb_next = bucket->dthb_next;
7865 }
7866
7867 ASSERT(hash->dth_nbuckets > 0);
7868 hash->dth_nbuckets--;
7869 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7870 return;
7871 }
7872
7873 bucket->dthb_chain = *nextp;
7874 } else {
7875 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7876 }
7877
7878 if (*nextp != NULL)
7879 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7880 }
7881
7882 /*
7883 * DTrace Utility Functions
7884 *
7885 * These are assorted utility functions that are _not_ called from probe context.
7886 */
7887 static int
7888 dtrace_badattr(const dtrace_attribute_t *a)
7889 {
7890 return (a->dtat_name > DTRACE_STABILITY_MAX ||
7891 a->dtat_data > DTRACE_STABILITY_MAX ||
7892 a->dtat_class > DTRACE_CLASS_MAX);
7893 }
7894
7895 /*
7896 * Returns a dtrace-managed, reference-counted copy of a string,
7897 * deduplicating copies of the same string.
7898 * If the specified string is NULL, returns an empty string.
7899 */
7900 static char *
7901 dtrace_strref(const char *str)
7902 {
7903 dtrace_string_t *s = NULL;
7904 size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
7905
7906 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7907
7908 if (str == NULL)
7909 str = "";
7910
7911 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
7912 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
7913 if (strncmp(str, s->dtst_str, bufsize) != 0) {
7914 continue;
7915 }
7916 ASSERT(s->dtst_refcount != UINT32_MAX);
7917 s->dtst_refcount++;
7918 return s->dtst_str;
7919 }
7920
7921 s = kmem_zalloc(sizeof(dtrace_string_t) + bufsize, KM_SLEEP);
7922 s->dtst_refcount = 1;
7923 (void) strlcpy(s->dtst_str, str, bufsize);
7924
7925 dtrace_hash_add(dtrace_strings, s);
7926
7927 return s->dtst_str;
7928 }
7929
7930 static void
7931 dtrace_strunref(const char *str)
7932 {
7933 ASSERT(str != NULL);
7934 dtrace_string_t *s = NULL;
7935 size_t bufsize = strlen(str) + 1;
7936
7937 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7938
7939 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
7940 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
7941 if (strncmp(str, s->dtst_str, bufsize) != 0) {
7942 continue;
7943 }
7944 ASSERT(s->dtst_refcount != 0);
7945 s->dtst_refcount--;
7946 if (s->dtst_refcount == 0) {
7947 dtrace_hash_remove(dtrace_strings, s);
7948 kmem_free(s, sizeof(dtrace_string_t) + bufsize);
7949 }
7950 return;
7951 }
7952 panic("attempt to unref non-existent string %s", str);
7953 }
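/*
 * A minimal usage sketch for the two routines above (hypothetical helper;
 * not part of the original source).  Every dtrace_strref() must eventually
 * be balanced by a dtrace_strunref() of the returned pointer, and both
 * must be called with dtrace_lock held.
 */
#if 0 /* illustrative sketch */
static void
example_strref_usage(const char *name)
{
	char *interned;

	lck_mtx_lock(&dtrace_lock);
	interned = dtrace_strref(name);		/* refcount >= 1 */
	/* ... use the interned, deduplicated copy ... */
	dtrace_strunref(interned);		/* drop the ref; may free */
	lck_mtx_unlock(&dtrace_lock);
}
#endif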
7954
7955 #define DTRACE_ISALPHA(c) \
7956 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
7957
7958 static int
7959 dtrace_badname(const char *s)
7960 {
7961 char c;
7962
7963 if (s == NULL || (c = *s++) == '\0')
7964 return (0);
7965
7966 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
7967 return (1);
7968
7969 while ((c = *s++) != '\0') {
7970 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
7971 c != '-' && c != '_' && c != '.' && c != '`')
7972 return (1);
7973 }
7974
7975 return (0);
7976 }
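/*
 * Examples of the name grammar enforced above (illustrative): a name must
 * begin with an alphabetic character, '-', '_' or '.', and may continue
 * with those characters, digits, or the '`' scoping character:
 *
 *	dtrace_badname("fbt") == 0		(valid)
 *	dtrace_badname("mach_kernel`foo") == 0	(valid; '`' is allowed
 *						 after the first character)
 *	dtrace_badname("9lives") == 1		(leading digit)
 *	dtrace_badname("bad name") == 1		(space not allowed)
 */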
7977
7978 static void
7979 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
7980 {
7981 uint32_t priv;
7982
7983 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
7984 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
7985 priv = DTRACE_PRIV_USER | DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER;
7986 }
7987 else {
7988 priv = DTRACE_PRIV_ALL;
7989 }
7990 *uidp = 0;
7991 *zoneidp = 0;
7992 } else {
7993 *uidp = crgetuid(cr);
7994 *zoneidp = crgetzoneid(cr);
7995
7996 priv = 0;
7997 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
7998 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
7999 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
8000 priv |= DTRACE_PRIV_USER;
8001 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
8002 priv |= DTRACE_PRIV_PROC;
8003 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
8004 priv |= DTRACE_PRIV_OWNER;
8005 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
8006 priv |= DTRACE_PRIV_ZONEOWNER;
8007 }
8008
8009 *privp = priv;
8010 }
8011
8012 #ifdef DTRACE_ERRDEBUG
8013 static void
8014 dtrace_errdebug(const char *str)
8015 {
8016 int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
8017 int occupied = 0;
8018
8019 lck_mtx_lock(&dtrace_errlock);
8020 dtrace_errlast = str;
8021 dtrace_errthread = (kthread_t *)current_thread();
8022
8023 while (occupied++ < DTRACE_ERRHASHSZ) {
8024 if (dtrace_errhash[hval].dter_msg == str) {
8025 dtrace_errhash[hval].dter_count++;
8026 goto out;
8027 }
8028
8029 if (dtrace_errhash[hval].dter_msg != NULL) {
8030 hval = (hval + 1) % DTRACE_ERRHASHSZ;
8031 continue;
8032 }
8033
8034 dtrace_errhash[hval].dter_msg = str;
8035 dtrace_errhash[hval].dter_count = 1;
8036 goto out;
8037 }
8038
8039 panic("dtrace: undersized error hash");
8040 out:
8041 lck_mtx_unlock(&dtrace_errlock);
8042 }
8043 #endif
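/*
 * The error hash above is a simple open-addressed table with linear
 * probing, keyed on the message pointer.  Illustrative walk-through: if
 * two distinct messages hash to the same initial slot, the second scans
 * forward (wrapping modulo DTRACE_ERRHASHSZ) until it finds its own entry
 * or an empty slot; only if all DTRACE_ERRHASHSZ slots are held by other
 * messages does the "undersized error hash" panic fire.
 */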
8044
8045 /*
8046 * DTrace Matching Functions
8047 *
8048 * These functions are used to match groups of probes, given some elements of
8049 * a probe tuple, or some globbed expressions for elements of a probe tuple.
8050 */
8051 static int
8052 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8053 zoneid_t zoneid)
8054 {
8055 if (priv != DTRACE_PRIV_ALL) {
8056 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8057 uint32_t match = priv & ppriv;
8058
8059 /*
8060 * No PRIV_DTRACE_* privileges...
8061 */
8062 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8063 DTRACE_PRIV_KERNEL)) == 0)
8064 return (0);
8065
8066 /*
8067 * No matching bits, but there were bits to match...
8068 */
8069 if (match == 0 && ppriv != 0)
8070 return (0);
8071
8072 /*
8073 * Need to have permissions to the process, but don't...
8074 */
8075 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8076 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8077 return (0);
8078 }
8079
8080 /*
8081 * Need to be in the same zone unless we possess the
8082 * privilege to examine all zones.
8083 */
8084 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8085 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8086 return (0);
8087 }
8088 }
8089
8090 return (1);
8091 }
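/*
 * A worked example of the checks above (illustrative): take a probe whose
 * provider registered with dtpp_flags = DTRACE_PRIV_PROC |
 * DTRACE_PRIV_OWNER and dtpp_uid = 501.  For a consumer holding only
 * DTRACE_PRIV_PROC as uid 502, match == DTRACE_PRIV_PROC, which leaves
 * DTRACE_PRIV_OWNER set in (ppriv & ~match); since 502 != 501, the
 * function returns 0.  The same consumer running as uid 501 passes every
 * check and matches.
 */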
8092
8093 /*
8094 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8095 * consists of input pattern strings and an ops-vector to evaluate them.
8096 * This function returns >0 for match, 0 for no match, and <0 for error.
8097 */
8098 static int
8099 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8100 uint32_t priv, uid_t uid, zoneid_t zoneid)
8101 {
8102 dtrace_provider_t *pvp = prp->dtpr_provider;
8103 int rv;
8104
8105 if (pvp->dtpv_defunct)
8106 return (0);
8107
8108 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8109 return (rv);
8110
8111 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8112 return (rv);
8113
8114 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8115 return (rv);
8116
8117 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8118 return (rv);
8119
8120 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8121 return (0);
8122
8123 return (rv);
8124 }
8125
8126 /*
8127 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8128 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
8129 * libc's version, the kernel version only applies to 8-bit ASCII strings.
8130 * In addition, all of the recursion cases except for '*' matching have been
8131 * unwound. For '*', we still implement recursive evaluation, but a depth
8132 * counter is maintained and matching is aborted if we recurse too deep.
8133 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
8134 */
8135 static int
8136 dtrace_match_glob(const char *s, const char *p, int depth)
8137 {
8138 const char *olds;
8139 char s1, c;
8140 int gs;
8141
8142 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
8143 return (-1);
8144
8145 if (s == NULL)
8146 s = ""; /* treat NULL as empty string */
8147
8148 top:
8149 olds = s;
8150 s1 = *s++;
8151
8152 if (p == NULL)
8153 return (0);
8154
8155 if ((c = *p++) == '\0')
8156 return (s1 == '\0');
8157
8158 switch (c) {
8159 case '[': {
8160 int ok = 0, notflag = 0;
8161 char lc = '\0';
8162
8163 if (s1 == '\0')
8164 return (0);
8165
8166 if (*p == '!') {
8167 notflag = 1;
8168 p++;
8169 }
8170
8171 if ((c = *p++) == '\0')
8172 return (0);
8173
8174 do {
8175 if (c == '-' && lc != '\0' && *p != ']') {
8176 if ((c = *p++) == '\0')
8177 return (0);
8178 if (c == '\\' && (c = *p++) == '\0')
8179 return (0);
8180
8181 if (notflag) {
8182 if (s1 < lc || s1 > c)
8183 ok++;
8184 else
8185 return (0);
8186 } else if (lc <= s1 && s1 <= c)
8187 ok++;
8188
8189 } else if (c == '\\' && (c = *p++) == '\0')
8190 return (0);
8191
8192 lc = c; /* save left-hand 'c' for next iteration */
8193
8194 if (notflag) {
8195 if (s1 != c)
8196 ok++;
8197 else
8198 return (0);
8199 } else if (s1 == c)
8200 ok++;
8201
8202 if ((c = *p++) == '\0')
8203 return (0);
8204
8205 } while (c != ']');
8206
8207 if (ok)
8208 goto top;
8209
8210 return (0);
8211 }
8212
8213 case '\\':
8214 if ((c = *p++) == '\0')
8215 return (0);
8216 /*FALLTHRU*/
8217
8218 default:
8219 if (c != s1)
8220 return (0);
8221 /*FALLTHRU*/
8222
8223 case '?':
8224 if (s1 != '\0')
8225 goto top;
8226 return (0);
8227
8228 case '*':
8229 while (*p == '*')
8230 p++; /* consecutive *'s are identical to a single one */
8231
8232 if (*p == '\0')
8233 return (1);
8234
8235 for (s = olds; *s != '\0'; s++) {
8236 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
8237 return (gs);
8238 }
8239
8240 return (0);
8241 }
8242 }
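/*
 * Illustrative results for the matcher above (not part of the original
 * source):
 *
 *	dtrace_match_glob("read", "read", 0) > 0	(literal match)
 *	dtrace_match_glob("readv", "read*", 0) > 0	('*' matches "v")
 *	dtrace_match_glob("read", "r??d", 0) > 0	('?' matches one char)
 *	dtrace_match_glob("read", "[a-r]*", 0) > 0	('r' falls in [a-r])
 *	dtrace_match_glob("write", "read*", 0) == 0	(no match)
 *
 * A return of -1 is possible only when nested '*' evaluation recurses past
 * DTRACE_PROBEKEY_MAXDEPTH.
 */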
8243
8244 /*ARGSUSED*/
8245 static int
8246 dtrace_match_string(const char *s, const char *p, int depth)
8247 {
8248 #pragma unused(depth) /* __APPLE__ */
8249 return (s != NULL && s == p);
8250 }
8251
8252 /*ARGSUSED*/
8253 static int
8254 dtrace_match_module(const char *s, const char *p, int depth)
8255 {
8256 #pragma unused(depth) /* __APPLE__ */
8257 size_t len;
8258 if (s == NULL || p == NULL)
8259 return (0);
8260
8261 len = strlen(p);
8262
8263 if (strncmp(p, s, len) != 0)
8264 return (0);
8265
8266 if (s[len] == '.' || s[len] == '\0')
8267 return (1);
8268
8269 return (0);
8270 }
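/*
 * Illustrative results for the module matcher above: it accepts an exact
 * module name or one extended by a '.'-separated suffix, e.g.
 *
 *	dtrace_match_module("foo", "foo", 0) == 1	(exact)
 *	dtrace_match_module("foo.3", "foo", 0) == 1	(version suffix)
 *	dtrace_match_module("foobar", "foo", 0) == 0	(no '.' boundary)
 */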
8271
8272 /*ARGSUSED*/
8273 static int
8274 dtrace_match_nul(const char *s, const char *p, int depth)
8275 {
8276 #pragma unused(s, p, depth) /* __APPLE__ */
8277 return (1); /* always match the empty pattern */
8278 }
8279
8280 /*ARGSUSED*/
8281 static int
8282 dtrace_match_nonzero(const char *s, const char *p, int depth)
8283 {
8284 #pragma unused(p, depth) /* __APPLE__ */
8285 return (s != NULL && s[0] != '\0');
8286 }
8287
8288 static int
8289 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8290 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *, void *), void *arg1, void *arg2)
8291 {
8292 dtrace_probe_t *probe;
8293 dtrace_provider_t prov_template = {
8294 .dtpv_name = (char *)(uintptr_t)pkp->dtpk_prov
8295 };
8296
8297 dtrace_probe_t template = {
8298 .dtpr_provider = &prov_template,
8299 .dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod,
8300 .dtpr_func = (char *)(uintptr_t)pkp->dtpk_func,
8301 .dtpr_name = (char *)(uintptr_t)pkp->dtpk_name
8302 };
8303
8304 dtrace_hash_t *hash = NULL;
8305 int len, rc, best = INT_MAX, nmatched = 0;
8306 dtrace_id_t i;
8307
8308 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8309
8310 /*
8311 * If the probe ID is specified in the key, just lookup by ID and
8312 * invoke the match callback once if a matching probe is found.
8313 */
8314 if (pkp->dtpk_id != DTRACE_IDNONE) {
8315 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8316 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8317 if ((*matched)(probe, arg1, arg2) == DTRACE_MATCH_FAIL)
8318 return (DTRACE_MATCH_FAIL);
8319 nmatched++;
8320 }
8321 return (nmatched);
8322 }
8323
8324 /*
8325 * We want to find the most distinct of the provider name, module name,
8326 * function name, and probe name. So for each one that is not a glob
8327 * pattern or empty string, we perform a lookup in the corresponding
8328 * hash and use the hash table with the fewest collisions to do our
8329 * search.
8330 */
8331 if (pkp->dtpk_pmatch == &dtrace_match_string &&
8332 (len = dtrace_hash_collisions(dtrace_byprov, &template)) < best) {
8333 best = len;
8334 hash = dtrace_byprov;
8335 }
8336
8337 if (pkp->dtpk_mmatch == &dtrace_match_string &&
8338 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8339 best = len;
8340 hash = dtrace_bymod;
8341 }
8342
8343 if (pkp->dtpk_fmatch == &dtrace_match_string &&
8344 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8345 best = len;
8346 hash = dtrace_byfunc;
8347 }
8348
8349 if (pkp->dtpk_nmatch == &dtrace_match_string &&
8350 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8351 best = len;
8352 hash = dtrace_byname;
8353 }
8354
8355 /*
8356 * If we did not select a hash table, iterate over every probe and
8357 * invoke our callback for each one that matches our input probe key.
8358 */
8359 if (hash == NULL) {
8360 for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
8361 if ((probe = dtrace_probes[i]) == NULL ||
8362 dtrace_match_probe(probe, pkp, priv, uid,
8363 zoneid) <= 0)
8364 continue;
8365
8366 nmatched++;
8367
8368 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8369 if (rc == DTRACE_MATCH_FAIL)
8370 return (DTRACE_MATCH_FAIL);
8371 break;
8372 }
8373 }
8374
8375 return (nmatched);
8376 }
8377
8378 /*
8379 * If we selected a hash table, iterate over each probe of the same key
8380 * name and invoke the callback for every probe that matches the other
8381 * attributes of our input probe key.
8382 */
8383 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8384 probe = *(DTRACE_HASHNEXT(hash, probe))) {
8385
8386 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8387 continue;
8388
8389 nmatched++;
8390
8391 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8392 if (rc == DTRACE_MATCH_FAIL)
8393 return (DTRACE_MATCH_FAIL);
8394 break;
8395 }
8396 }
8397
8398 return (nmatched);
8399 }
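/*
 * An illustrative walk-through of the selection above (not part of the
 * original source): for the description syscall::read:entry, the provider
 * ("syscall"), function ("read") and name ("entry") keys all classify as
 * dtrace_match_string, so dtrace_byprov, dtrace_byfunc and dtrace_byname
 * are each asked for their collision counts, and the chain with the
 * fewest same-keyed probes -- typically dtrace_byfunc for "read" -- is
 * walked, with dtrace_match_probe() filtering it on the remaining fields.
 * A description with no non-glob, non-empty fields selects no hash at all
 * and falls back to a linear scan of dtrace_probes[].
 */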
8400
8401 /*
8402 * Return the function pointer dtrace_match_probe() should use to compare the
8403 * specified pattern with a string. For NULL or empty patterns, we select
8404 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
8405 * For non-empty non-glob strings, we use dtrace_match_string().
8406 */
8407 static dtrace_probekey_f *
8408 dtrace_probekey_func(const char *p)
8409 {
8410 char c;
8411
8412 if (p == NULL || *p == '\0')
8413 return (&dtrace_match_nul);
8414
8415 while ((c = *p++) != '\0') {
8416 if (c == '[' || c == '?' || c == '*' || c == '\\')
8417 return (&dtrace_match_glob);
8418 }
8419
8420 return (&dtrace_match_string);
8421 }
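/*
 * Classification examples for the routine above (illustrative):
 *
 *	dtrace_probekey_func(NULL) == &dtrace_match_nul
 *	dtrace_probekey_func("") == &dtrace_match_nul
 *	dtrace_probekey_func("read*") == &dtrace_match_glob
 *	dtrace_probekey_func("read") == &dtrace_match_string
 */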
8422
8423 static dtrace_probekey_f *
8424 dtrace_probekey_module_func(const char *p)
8425 {
8426 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8427
8428 dtrace_probekey_f *f = dtrace_probekey_func(p);
8429 if (f == &dtrace_match_string) {
8430 dtrace_probe_t template = {
8431 .dtpr_mod = (char *)(uintptr_t)p,
8432 };
8433 if (dtrace_hash_lookup(dtrace_bymod, &template) == NULL) {
8434 return (&dtrace_match_module);
8435 }
8436 return (&dtrace_match_string);
8437 }
8438 return f;
8439 }
8440
8441 /*
8442 * Build a probe comparison key for use with dtrace_match_probe() from the
8443 * given probe description. By convention, a null key only matches anchored
8444 * probes: if each field is the empty string, reset dtpk_fmatch to
8445 * dtrace_match_nonzero().
8446 */
8447 static void
8448 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8449 {
8450
8451 pkp->dtpk_prov = dtrace_strref(pdp->dtpd_provider);
8452 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8453
8454 pkp->dtpk_mod = dtrace_strref(pdp->dtpd_mod);
8455 pkp->dtpk_mmatch = dtrace_probekey_module_func(pdp->dtpd_mod);
8456
8457 pkp->dtpk_func = dtrace_strref(pdp->dtpd_func);
8458 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8459
8460 pkp->dtpk_name = dtrace_strref(pdp->dtpd_name);
8461 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8462
8463 pkp->dtpk_id = pdp->dtpd_id;
8464
8465 if (pkp->dtpk_id == DTRACE_IDNONE &&
8466 pkp->dtpk_pmatch == &dtrace_match_nul &&
8467 pkp->dtpk_mmatch == &dtrace_match_nul &&
8468 pkp->dtpk_fmatch == &dtrace_match_nul &&
8469 pkp->dtpk_nmatch == &dtrace_match_nul)
8470 pkp->dtpk_fmatch = &dtrace_match_nonzero;
8471 }
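/*
 * Example of the null-key convention above (illustrative): for the empty
 * description ':::' every field classifies as dtrace_match_nul, so
 * dtpk_fmatch is reset to dtrace_match_nonzero -- the key then matches
 * only probes with a non-empty function name (anchored probes), skipping
 * unanchored probes such as profile:::tick-1sec.
 */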
8472
8473 static void
8474 dtrace_probekey_release(dtrace_probekey_t *pkp)
8475 {
8476 dtrace_strunref(pkp->dtpk_prov);
8477 dtrace_strunref(pkp->dtpk_mod);
8478 dtrace_strunref(pkp->dtpk_func);
8479 dtrace_strunref(pkp->dtpk_name);
8480 }
8481
8482 static int
8483 dtrace_cond_provider_match(dtrace_probedesc_t *desc, void *data)
8484 {
8485 if (desc == NULL)
8486 return 1;
8487
8488 dtrace_probekey_f *func = dtrace_probekey_func(desc->dtpd_provider);
8489
8490 return func((char*)data, desc->dtpd_provider, 0);
8491 }
8492
8493 /*
8494 * DTrace Provider-to-Framework API Functions
8495 *
8496 * These functions implement much of the Provider-to-Framework API, as
8497 * described in <sys/dtrace.h>. The parts of the API not in this section are
8498 * the functions in the API for probe management (found below), and
8499 * dtrace_probe() itself (found above).
8500 */
8501
8502 /*
8503 * Register the calling provider with the DTrace framework. This should
8504 * generally be called by DTrace providers in their attach(9E) entry point.
8505 */
8506 int
8507 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8508 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8509 {
8510 dtrace_provider_t *provider;
8511
8512 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8513 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8514 "arguments", name ? name : "<NULL>");
8515 return (EINVAL);
8516 }
8517
8518 if (name[0] == '\0' || dtrace_badname(name)) {
8519 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8520 "provider name", name);
8521 return (EINVAL);
8522 }
8523
8524 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8525 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8526 pops->dtps_destroy == NULL ||
8527 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8528 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8529 "provider ops", name);
8530 return (EINVAL);
8531 }
8532
8533 if (dtrace_badattr(&pap->dtpa_provider) ||
8534 dtrace_badattr(&pap->dtpa_mod) ||
8535 dtrace_badattr(&pap->dtpa_func) ||
8536 dtrace_badattr(&pap->dtpa_name) ||
8537 dtrace_badattr(&pap->dtpa_args)) {
8538 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8539 "provider attributes", name);
8540 return (EINVAL);
8541 }
8542
8543 if (priv & ~DTRACE_PRIV_ALL) {
8544 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8545 "privilege attributes", name);
8546 return (EINVAL);
8547 }
8548
8549 if ((priv & DTRACE_PRIV_KERNEL) &&
8550 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8551 pops->dtps_usermode == NULL) {
8552 cmn_err(CE_WARN, "failed to register provider '%s': need "
8553 "dtps_usermode() op for given privilege attributes", name);
8554 return (EINVAL);
8555 }
8556
8557 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8558
8559 provider->dtpv_attr = *pap;
8560 provider->dtpv_priv.dtpp_flags = priv;
8561 if (cr != NULL) {
8562 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8563 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8564 }
8565 provider->dtpv_pops = *pops;
8566
8567 if (pops->dtps_provide == NULL) {
8568 ASSERT(pops->dtps_provide_module != NULL);
8569 provider->dtpv_pops.dtps_provide = dtrace_provide_nullop;
8570 }
8571
8572 if (pops->dtps_provide_module == NULL) {
8573 ASSERT(pops->dtps_provide != NULL);
8574 provider->dtpv_pops.dtps_provide_module =
8575 dtrace_provide_module_nullop;
8576 }
8577
8578 if (pops->dtps_suspend == NULL) {
8579 ASSERT(pops->dtps_resume == NULL);
8580 provider->dtpv_pops.dtps_suspend = dtrace_suspend_nullop;
8581 provider->dtpv_pops.dtps_resume = dtrace_resume_nullop;
8582 }
8583
8584 provider->dtpv_arg = arg;
8585 *idp = (dtrace_provider_id_t)provider;
8586
8587 if (pops == &dtrace_provider_ops) {
8588 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8589 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8590
8591 provider->dtpv_name = dtrace_strref(name);
8592
8593 ASSERT(dtrace_anon.dta_enabling == NULL);
8594
8595 /*
8596 * We make sure that the DTrace provider is at the head of
8597 * the provider chain.
8598 */
8599 provider->dtpv_next = dtrace_provider;
8600 dtrace_provider = provider;
8601 return (0);
8602 }
8603
8604 lck_mtx_lock(&dtrace_provider_lock);
8605 lck_mtx_lock(&dtrace_lock);
8606
8607 provider->dtpv_name = dtrace_strref(name);
8608
8609 /*
8610 * If there is at least one provider registered, we'll add this
8611 * provider after the first provider.
8612 */
8613 if (dtrace_provider != NULL) {
8614 provider->dtpv_next = dtrace_provider->dtpv_next;
8615 dtrace_provider->dtpv_next = provider;
8616 } else {
8617 dtrace_provider = provider;
8618 }
8619
8620 if (dtrace_retained != NULL) {
8621 dtrace_enabling_provide(provider);
8622
8623 /*
8624 * Now we need to call dtrace_enabling_matchall_with_cond() --
8625 * with a condition matching the provider name we just added,
8626 * which will acquire cpu_lock and dtrace_lock. We therefore need
8627 * to drop all of our locks before calling into it...
8628 */
8629 lck_mtx_unlock(&dtrace_lock);
8630 lck_mtx_unlock(&dtrace_provider_lock);
8631
8632 dtrace_match_cond_t cond = {dtrace_cond_provider_match, provider->dtpv_name};
8633 dtrace_enabling_matchall_with_cond(&cond);
8634
8635 return (0);
8636 }
8637
8638 lck_mtx_unlock(&dtrace_lock);
8639 lck_mtx_unlock(&dtrace_provider_lock);
8640
8641 return (0);
8642 }
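/*
 * A minimal registration sketch (hypothetical provider; the "example"
 * names are invented for illustration and are not part of the original
 * source).  The ops vector must satisfy the checks above: dtps_provide
 * (or dtps_provide_module), dtps_enable, dtps_disable and dtps_destroy
 * are mandatory, and dtps_suspend/dtps_resume may only be given as a
 * pair.
 */
#if 0 /* illustrative sketch */
static dtrace_pattr_t example_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
};

static dtrace_pops_t example_pops = {
	.dtps_provide = example_provide,	/* create probes on demand */
	.dtps_enable = example_enable,
	.dtps_disable = example_disable,
	.dtps_destroy = example_destroy,
};

static dtrace_provider_id_t example_id;

static int
example_attach(void)
{
	/* returns 0 on success, or EINVAL for any of the failures above */
	return (dtrace_register("example", &example_attr,
	    DTRACE_PRIV_KERNEL, NULL, &example_pops, NULL, &example_id));
}
#endif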
8643
8644 /*
8645 * Unregister the specified provider from the DTrace framework. This should
8646 * generally be called by DTrace providers in their detach(9E) entry point.
8647 */
8648 int
8649 dtrace_unregister(dtrace_provider_id_t id)
8650 {
8651 dtrace_provider_t *old = (dtrace_provider_t *)id;
8652 dtrace_provider_t *prev = NULL;
8653 int self = 0;
8654 dtrace_probe_t *probe, *first = NULL, *next = NULL;
8655 dtrace_probe_t template = {
8656 .dtpr_provider = old
8657 };
8658
8659 if (old->dtpv_pops.dtps_enable ==
8660 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
8661 /*
8662 * If DTrace itself is the provider, we're called with locks
8663 * already held.
8664 */
8665 ASSERT(old == dtrace_provider);
8666 ASSERT(dtrace_devi != NULL);
8667 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8668 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8669 self = 1;
8670
8671 if (dtrace_provider->dtpv_next != NULL) {
8672 /*
8673 * There's another provider here; return failure.
8674 */
8675 return (EBUSY);
8676 }
8677 } else {
8678 lck_mtx_lock(&dtrace_provider_lock);
8679 lck_mtx_lock(&mod_lock);
8680 lck_mtx_lock(&dtrace_lock);
8681 }
8682
8683 /*
8684 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8685 * probes, we refuse to let providers slither away, unless this
8686 * provider has already been explicitly invalidated.
8687 */
8688 if (!old->dtpv_defunct &&
8689 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8690 dtrace_anon.dta_state->dts_necbs > 0))) {
8691 if (!self) {
8692 lck_mtx_unlock(&dtrace_lock);
8693 lck_mtx_unlock(&mod_lock);
8694 lck_mtx_unlock(&dtrace_provider_lock);
8695 }
8696 return (EBUSY);
8697 }
8698
8699 /*
8700 * Attempt to destroy the probes associated with this provider.
8701 */
8702 if (old->dtpv_ecb_count != 0) {
8703 /*
8704 * We have at least one ECB; we can't remove this provider.
8705 */
8706 if (!self) {
8707 lck_mtx_unlock(&dtrace_lock);
8708 lck_mtx_unlock(&mod_lock);
8709 lck_mtx_unlock(&dtrace_provider_lock);
8710 }
8711 return (EBUSY);
8712 }
8713
8714 /*
8715 * All of the probes for this provider are disabled; we can safely
8716 * remove all of them from their hash chains and from the probe array.
8717 */
8718 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8719 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8720 if (probe->dtpr_provider != old)
8721 continue;
8722
8723 dtrace_probes[probe->dtpr_id - 1] = NULL;
8724 old->dtpv_probe_count--;
8725
8726 dtrace_hash_remove(dtrace_bymod, probe);
8727 dtrace_hash_remove(dtrace_byfunc, probe);
8728 dtrace_hash_remove(dtrace_byname, probe);
8729
8730 if (first == NULL) {
8731 first = probe;
8732 probe->dtpr_nextmod = NULL;
8733 } else {
8734 /*
8735 * Use nextmod as the chain of probes to remove
8736 */
8737 probe->dtpr_nextmod = first;
8738 first = probe;
8739 }
8740 }
8741
8742 for (probe = first; probe != NULL; probe = next) {
8743 next = probe->dtpr_nextmod;
8744 dtrace_hash_remove(dtrace_byprov, probe);
8745 }
8746
8747 /*
8748 * The provider's probes have been removed from the hash chains and
8749 * from the probe array. Now issue a dtrace_sync() to be sure that
8750 * everyone has cleared out from any probe array processing.
8751 */
8752 dtrace_sync();
8753
8754 for (probe = first; probe != NULL; probe = next) {
8755 next = probe->dtpr_nextmod;
8756
8757 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8758 probe->dtpr_arg);
8759 dtrace_strunref(probe->dtpr_mod);
8760 dtrace_strunref(probe->dtpr_func);
8761 dtrace_strunref(probe->dtpr_name);
8762 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8763 zfree(dtrace_probe_t_zone, probe);
8764 }
8765
8766 if ((prev = dtrace_provider) == old) {
8767 ASSERT(self || dtrace_devi == NULL);
8768 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8769 dtrace_provider = old->dtpv_next;
8770 } else {
8771 while (prev != NULL && prev->dtpv_next != old)
8772 prev = prev->dtpv_next;
8773
8774 if (prev == NULL) {
8775 panic("attempt to unregister non-existent "
8776 "dtrace provider %p\n", (void *)id);
8777 }
8778
8779 prev->dtpv_next = old->dtpv_next;
8780 }
8781
8782 dtrace_strunref(old->dtpv_name);
8783
8784 if (!self) {
8785 lck_mtx_unlock(&dtrace_lock);
8786 lck_mtx_unlock(&mod_lock);
8787 lck_mtx_unlock(&dtrace_provider_lock);
8788 }
8789
8790 kmem_free(old, sizeof (dtrace_provider_t));
8791
8792 return (0);
8793 }
8794
8795 /*
8796 * Invalidate the specified provider. All subsequent probe lookups for the
8797 * specified provider will fail, but its probes will not be removed.
8798 */
8799 void
8800 dtrace_invalidate(dtrace_provider_id_t id)
8801 {
8802 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8803
8804 ASSERT(pvp->dtpv_pops.dtps_enable !=
8805 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8806
8807 lck_mtx_lock(&dtrace_provider_lock);
8808 lck_mtx_lock(&dtrace_lock);
8809
8810 pvp->dtpv_defunct = 1;
8811
8812 lck_mtx_unlock(&dtrace_lock);
8813 lck_mtx_unlock(&dtrace_provider_lock);
8814 }
8815
8816 /*
8817 * Indicate whether or not DTrace has attached.
8818 */
8819 int
8820 dtrace_attached(void)
8821 {
8822 /*
8823 * dtrace_provider will be non-NULL iff the DTrace driver has
8824 * attached. (It's non-NULL because DTrace is always itself a
8825 * provider.)
8826 */
8827 return (dtrace_provider != NULL);
8828 }
8829
8830 /*
8831 * Remove all the unenabled probes for the given provider. This function is
8832 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8833 * -- just as many of its associated probes as it can.
8834 */
8835 int
8836 dtrace_condense(dtrace_provider_id_t id)
8837 {
8838 dtrace_provider_t *prov = (dtrace_provider_t *)id;
8839 dtrace_probe_t *probe, *first = NULL;
8840 dtrace_probe_t template = {
8841 .dtpr_provider = prov
8842 };
8843
8844 /*
8845 * Make sure this isn't the dtrace provider itself.
8846 */
8847 ASSERT(prov->dtpv_pops.dtps_enable !=
8848 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8849
8850 lck_mtx_lock(&dtrace_provider_lock);
8851 lck_mtx_lock(&dtrace_lock);
8852
8853 /*
8854 * Attempt to destroy the probes associated with this provider.
8855 */
8856 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8857 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8858
8859 if (probe->dtpr_provider != prov)
8860 continue;
8861
8862 if (probe->dtpr_ecb != NULL)
8863 continue;
8864
8865 dtrace_probes[probe->dtpr_id - 1] = NULL;
8866 prov->dtpv_probe_count--;
8867
8868 dtrace_hash_remove(dtrace_bymod, probe);
8869 dtrace_hash_remove(dtrace_byfunc, probe);
8870 dtrace_hash_remove(dtrace_byname, probe);
8871
8872 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
8873 probe->dtpr_arg);
8874 dtrace_strunref(probe->dtpr_mod);
8875 dtrace_strunref(probe->dtpr_func);
8876 dtrace_strunref(probe->dtpr_name);
8877 if (first == NULL) {
8878 first = probe;
8879 probe->dtpr_nextmod = NULL;
8880 } else {
8881 /*
8882 * Use nextmod as the chain of probes to remove
8883 */
8884 probe->dtpr_nextmod = first;
8885 first = probe;
8886 }
8887 }
8888
8889 for (probe = first; probe != NULL; probe = first) {
8890 first = probe->dtpr_nextmod;
8891 dtrace_hash_remove(dtrace_byprov, probe);
8892 vmem_free(dtrace_arena, (void *)((uintptr_t)probe->dtpr_id), 1);
8893 zfree(dtrace_probe_t_zone, probe);
8894 }
8895
8896 lck_mtx_unlock(&dtrace_lock);
8897 lck_mtx_unlock(&dtrace_provider_lock);
8898
8899 return (0);
8900 }
8901
8902 /*
8903 * DTrace Probe Management Functions
8904 *
8905 * The functions in this section perform the DTrace probe management,
8906 * including functions to create probes, look-up probes, and call into the
8907 * providers to request that probes be provided. Some of these functions are
8908 * in the Provider-to-Framework API; these functions can be identified by the
8909 * fact that they are not declared "static".
8910 */
8911
8912 /*
8913 * Create a probe with the specified module name, function name, and name.
8914 */
8915 dtrace_id_t
8916 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8917 const char *func, const char *name, int aframes, void *arg)
8918 {
8919 dtrace_probe_t *probe, **probes;
8920 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8921 dtrace_id_t id;
8922
8923 if (provider == dtrace_provider) {
8924 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8925 } else {
8926 lck_mtx_lock(&dtrace_lock);
8927 }
8928
8929 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8930 VM_BESTFIT | VM_SLEEP);
8931
8932 probe = zalloc(dtrace_probe_t_zone);
8933 bzero(probe, sizeof (dtrace_probe_t));
8934
8935 probe->dtpr_id = id;
8936 probe->dtpr_gen = dtrace_probegen++;
8937 probe->dtpr_mod = dtrace_strref(mod);
8938 probe->dtpr_func = dtrace_strref(func);
8939 probe->dtpr_name = dtrace_strref(name);
8940 probe->dtpr_arg = arg;
8941 probe->dtpr_aframes = aframes;
8942 probe->dtpr_provider = provider;
8943
8944 dtrace_hash_add(dtrace_byprov, probe);
8945 dtrace_hash_add(dtrace_bymod, probe);
8946 dtrace_hash_add(dtrace_byfunc, probe);
8947 dtrace_hash_add(dtrace_byname, probe);
8948
8949 if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
8950 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8951 size_t nsize = osize * 2;
8952
8953 probes = kmem_zalloc(nsize, KM_SLEEP);
8954
8955 dtrace_probe_t **oprobes = dtrace_probes;
8956
8957 bcopy(oprobes, probes, osize);
8958 dtrace_membar_producer();
8959 dtrace_probes = probes;
8960
8961 dtrace_sync();
8962
8963 /*
8964 * All CPUs are now seeing the new probes array; we can
8965 * safely free the old array.
8966 */
8967 kmem_free(oprobes, osize);
8968 dtrace_nprobes *= 2;
8969
8970 ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
8971 }
8972
8973 ASSERT(dtrace_probes[id - 1] == NULL);
8974 dtrace_probes[id - 1] = probe;
8975 provider->dtpv_probe_count++;
8976
8977 if (provider != dtrace_provider)
8978 lck_mtx_unlock(&dtrace_lock);
8979
8980 return (id);
8981 }
8982
8983 static dtrace_probe_t *
8984 dtrace_probe_lookup_id(dtrace_id_t id)
8985 {
8986 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8987
8988 if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
8989 return (NULL);
8990
8991 return (dtrace_probes[id - 1]);
8992 }
8993
8994 static int
8995 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg1, void *arg2)
8996 {
8997 #pragma unused(arg2)
8998 *((dtrace_id_t *)arg1) = probe->dtpr_id;
8999
9000 return (DTRACE_MATCH_DONE);
9001 }
9002
9003 /*
9004 * Look up a probe based on provider and one or more of module name, function
9005 * name and probe name.
9006 */
9007 dtrace_id_t
9008 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
9009 const char *func, const char *name)
9010 {
9011 dtrace_probekey_t pkey;
9012 dtrace_id_t id;
9013 int match;
9014
9015 lck_mtx_lock(&dtrace_lock);
9016
9017 pkey.dtpk_prov = dtrace_strref(((dtrace_provider_t *)prid)->dtpv_name);
9018 pkey.dtpk_pmatch = &dtrace_match_string;
9019 pkey.dtpk_mod = dtrace_strref(mod);
9020 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9021 pkey.dtpk_func = dtrace_strref(func);
9022 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9023 pkey.dtpk_name = dtrace_strref(name);
9024 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9025 pkey.dtpk_id = DTRACE_IDNONE;
9026
9027 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
9028 dtrace_probe_lookup_match, &id, NULL);
9029
9030 dtrace_probekey_release(&pkey);
9031
9032 lck_mtx_unlock(&dtrace_lock);
9033
9034 ASSERT(match == 1 || match == 0);
9035 return (match ? id : 0);
9036 }
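/*
 * A common idiom built on the two routines above (hypothetical sketch;
 * example_id and example_provide continue the invented provider from the
 * registration sketch earlier): a dtps_provide callback looks a probe up
 * first, so that repeated provide requests do not create duplicates.
 */
#if 0 /* illustrative sketch */
static void
example_provide(void *arg, const dtrace_probedesc_t *desc)
{
#pragma unused(arg, desc)
	if (dtrace_probe_lookup(example_id, "mach_kernel",
	    "example_func", "entry") != 0)
		return;		/* already provided */

	(void) dtrace_probe_create(example_id, "mach_kernel",
	    "example_func", "entry", 0 /* aframes */, NULL /* arg */);
}
#endif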
9037
9038 /*
9039 * Returns the probe argument associated with the specified probe.
9040 */
9041 void *
9042 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9043 {
9044 dtrace_probe_t *probe;
9045 void *rval = NULL;
9046
9047 lck_mtx_lock(&dtrace_lock);
9048
9049 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
9050 probe->dtpr_provider == (dtrace_provider_t *)id)
9051 rval = probe->dtpr_arg;
9052
9053 lck_mtx_unlock(&dtrace_lock);
9054
9055 return (rval);
9056 }
9057
9058 /*
9059 * Copy a probe into a probe description.
9060 */
9061 static void
9062 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
9063 {
9064 bzero(pdp, sizeof (dtrace_probedesc_t));
9065 pdp->dtpd_id = prp->dtpr_id;
9066
9067 /* APPLE NOTE: Darwin employs size-bounded string operations. */
9068 (void) strlcpy(pdp->dtpd_provider,
9069 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
9070
9071 (void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
9072 (void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
9073 (void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
9074 }
9075
9076 /*
9077 * Called to indicate that a probe -- or probes -- should be provided by a
9078 * specified provider. If the specified description is NULL, the provider will
9079 * be told to provide all of its probes. (This is done whenever a new
9080 * consumer comes along, or whenever a retained enabling is to be matched.) If
9081 * the specified description is non-NULL, the provider is given the
9082 * opportunity to dynamically provide the specified probe, allowing providers
9083 * to support the creation of probes on-the-fly. (So-called _autocreated_
9084 * probes.) If the provider is NULL, the operations will be applied to all
9085 * providers; if the provider is non-NULL the operations will only be applied
9086 * to the specified provider. The dtrace_provider_lock must be held, and the
9087 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9088 * will need to grab the dtrace_lock when it reenters the framework through
9089 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
9090 */
9091 static void
9092 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
9093 {
9094 struct modctl *ctl;
9095 int all = 0;
9096
9097 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
9098
9099 if (prv == NULL) {
9100 all = 1;
9101 prv = dtrace_provider;
9102 }
9103
9104 do {
9105 /*
9106 * First, call the blanket provide operation.
9107 */
9108 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9109
9110 /*
9111 * Now call the per-module provide operation. We will grab
9112 * mod_lock to prevent the list from being modified. Note
9113 * that this also prevents the mod_busy bits from changing.
9114 * (mod_busy can only be changed with mod_lock held.)
9115 */
9116 lck_mtx_lock(&mod_lock);
9117
9118 ctl = dtrace_modctl_list;
9119 while (ctl) {
9120 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9121 ctl = ctl->mod_next;
9122 }
9123
9124 lck_mtx_unlock(&mod_lock);
9125 } while (all && (prv = prv->dtpv_next) != NULL);
9126 }
9127
9128 /*
9129 * Iterate over each probe, and call the Framework-to-Provider API function
9130 * denoted by offs.
9131 */
9132 static void
9133 dtrace_probe_foreach(uintptr_t offs)
9134 {
9135 dtrace_provider_t *prov;
9136 void (*func)(void *, dtrace_id_t, void *);
9137 dtrace_probe_t *probe;
9138 dtrace_icookie_t cookie;
9139 int i;
9140
9141 /*
9142 * We disable interrupts to walk through the probe array. This is
9143 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
9144 * won't see stale data.
9145 */
9146 cookie = dtrace_interrupt_disable();
9147
9148 for (i = 0; i < dtrace_nprobes; i++) {
9149 if ((probe = dtrace_probes[i]) == NULL)
9150 continue;
9151
9152 if (probe->dtpr_ecb == NULL) {
9153 /*
9154 * This probe isn't enabled -- don't call the function.
9155 */
9156 continue;
9157 }
9158
9159 prov = probe->dtpr_provider;
9160 func = *((void(**)(void *, dtrace_id_t, void *))
9161 ((uintptr_t)&prov->dtpv_pops + offs));
9162
9163 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
9164 }
9165
9166 dtrace_interrupt_enable(cookie);
9167 }
9168
9169 static int
9170 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtrace_ecbdesc_t *ep)
9171 {
9172 dtrace_probekey_t pkey;
9173 uint32_t priv;
9174 uid_t uid;
9175 zoneid_t zoneid;
9176 int err;
9177
9178 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9179
9180 dtrace_ecb_create_cache = NULL;
9181
9182 if (desc == NULL) {
9183 /*
9184 * If we're passed a NULL description, we're being asked to
9185 * create an ECB with a NULL probe.
9186 */
9187 (void) dtrace_ecb_create_enable(NULL, enab, ep);
9188 return (0);
9189 }
9190
9191 dtrace_probekey(desc, &pkey);
9192 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9193 &priv, &uid, &zoneid);
9194
9195 err = dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab, ep);
9196
9197 dtrace_probekey_release(&pkey);
9198
9199 return err;
9200 }
9201
9202 /*
9203 * DTrace Helper Provider Functions
9204 */
9205 static void
9206 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9207 {
9208 attr->dtat_name = DOF_ATTR_NAME(dofattr);
9209 attr->dtat_data = DOF_ATTR_DATA(dofattr);
9210 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9211 }
9212
9213 static void
9214 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9215 const dof_provider_t *dofprov, char *strtab)
9216 {
9217 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9218 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9219 dofprov->dofpv_provattr);
9220 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9221 dofprov->dofpv_modattr);
9222 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9223 dofprov->dofpv_funcattr);
9224 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9225 dofprov->dofpv_nameattr);
9226 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9227 dofprov->dofpv_argsattr);
9228 }
9229
9230 static void
9231 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
9232 {
9233 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9234 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9235 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9236 dof_provider_t *provider;
9237 dof_probe_t *probe;
9238 uint32_t *off, *enoff;
9239 uint8_t *arg;
9240 char *strtab;
9241 uint_t i, nprobes;
9242 dtrace_helper_provdesc_t dhpv;
9243 dtrace_helper_probedesc_t dhpb;
9244 dtrace_meta_t *meta = dtrace_meta_pid;
9245 dtrace_mops_t *mops = &meta->dtm_mops;
9246 void *parg;
9247
9248 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9249 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9250 provider->dofpv_strtab * dof->dofh_secsize);
9251 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9252 provider->dofpv_probes * dof->dofh_secsize);
9253 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9254 provider->dofpv_prargs * dof->dofh_secsize);
9255 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9256 provider->dofpv_proffs * dof->dofh_secsize);
9257
9258 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9259 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9260 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9261 enoff = NULL;
9262
9263 /*
9264 * See dtrace_helper_provider_validate().
9265 */
9266 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9267 provider->dofpv_prenoffs != DOF_SECT_NONE) {
9268 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9269 provider->dofpv_prenoffs * dof->dofh_secsize);
9270 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9271 }
9272
9273 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9274
9275 /*
9276 * Create the provider.
9277 */
9278 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9279
9280 if ((parg = mops->dtms_provide_proc(meta->dtm_arg, &dhpv, p)) == NULL)
9281 return;
9282
9283 meta->dtm_count++;
9284
9285 /*
9286 * Create the probes.
9287 */
9288 for (i = 0; i < nprobes; i++) {
9289 probe = (dof_probe_t *)(uintptr_t)(daddr +
9290 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9291
9292 dhpb.dthpb_mod = dhp->dofhp_mod;
9293 dhpb.dthpb_func = strtab + probe->dofpr_func;
9294 dhpb.dthpb_name = strtab + probe->dofpr_name;
9295 #if !defined(__APPLE__)
9296 dhpb.dthpb_base = probe->dofpr_addr;
9297 #else
9298 dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
9299 #endif
9300 dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
9301 dhpb.dthpb_noffs = probe->dofpr_noffs;
9302 if (enoff != NULL) {
9303 dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
9304 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9305 } else {
9306 dhpb.dthpb_enoffs = NULL;
9307 dhpb.dthpb_nenoffs = 0;
9308 }
9309 dhpb.dthpb_args = arg + probe->dofpr_argidx;
9310 dhpb.dthpb_nargc = probe->dofpr_nargc;
9311 dhpb.dthpb_xargc = probe->dofpr_xargc;
9312 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9313 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9314
9315 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9316 }
9317
9318 /*
9319 * Since we just created probes, we need to match our retained
9320 * enablings against them -- knowing, as a precondition, that we
9321 * have only added probes from this provider.
9322 */
9323 char *prov_name = mops->dtms_provider_name(parg);
9324 ASSERT(prov_name != NULL);
9325 dtrace_match_cond_t cond = {dtrace_cond_provider_match, (void*)prov_name};
9326
9327 dtrace_enabling_matchall_with_cond(&cond);
9328 }
9329
9330 static void
9331 dtrace_helper_provide(dof_helper_t *dhp, proc_t *p)
9332 {
9333 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9334 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9335 uint32_t i;
9336
9337 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9338
9339 for (i = 0; i < dof->dofh_secnum; i++) {
9340 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9341 dof->dofh_secoff + i * dof->dofh_secsize);
9342
9343 if (sec->dofs_type != DOF_SECT_PROVIDER)
9344 continue;
9345
9346 dtrace_helper_provide_one(dhp, sec, p);
9347 }
9348 }
9349
9350 static void
9351 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
9352 {
9353 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9354 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9355 dof_sec_t *str_sec;
9356 dof_provider_t *provider;
9357 char *strtab;
9358 dtrace_helper_provdesc_t dhpv;
9359 dtrace_meta_t *meta = dtrace_meta_pid;
9360 dtrace_mops_t *mops = &meta->dtm_mops;
9361
9362 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9363 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9364 provider->dofpv_strtab * dof->dofh_secsize);
9365
9366 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9367
9368 /*
9369 * Create the provider.
9370 */
9371 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9372
9373 mops->dtms_remove_proc(meta->dtm_arg, &dhpv, p);
9374
9375 meta->dtm_count--;
9376 }
9377
9378 static void
9379 dtrace_helper_provider_remove(dof_helper_t *dhp, proc_t *p)
9380 {
9381 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9382 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9383 uint32_t i;
9384
9385 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9386
9387 for (i = 0; i < dof->dofh_secnum; i++) {
9388 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9389 dof->dofh_secoff + i * dof->dofh_secsize);
9390
9391 if (sec->dofs_type != DOF_SECT_PROVIDER)
9392 continue;
9393
9394 dtrace_helper_provider_remove_one(dhp, sec, p);
9395 }
9396 }
9397
9398 /*
9399 * DTrace Meta Provider-to-Framework API Functions
9400 *
9401 * These functions implement the Meta Provider-to-Framework API, as described
9402 * in <sys/dtrace.h>.
9403 */
9404 int
9405 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9406 dtrace_meta_provider_id_t *idp)
9407 {
9408 dtrace_meta_t *meta;
9409 dtrace_helpers_t *help, *next;
9410 uint_t i;
9411
9412 *idp = DTRACE_METAPROVNONE;
9413
9414 /*
9415 * We strictly don't need the name, but we hold onto it for
9416 * debuggability. All hail error queues!
9417 */
9418 if (name == NULL) {
9419 cmn_err(CE_WARN, "failed to register meta-provider: "
9420 "invalid name");
9421 return (EINVAL);
9422 }
9423
9424 if (mops == NULL ||
9425 mops->dtms_create_probe == NULL ||
9426 mops->dtms_provide_proc == NULL ||
9427 mops->dtms_remove_proc == NULL) {
9428 cmn_err(CE_WARN, "failed to register meta-provider %s: "
9429 "invalid ops", name);
9430 return (EINVAL);
9431 }
9432
9433 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9434 meta->dtm_mops = *mops;
9435 meta->dtm_arg = arg;
9436
9437 lck_mtx_lock(&dtrace_meta_lock);
9438 lck_mtx_lock(&dtrace_lock);
9439
9440 if (dtrace_meta_pid != NULL) {
9441 lck_mtx_unlock(&dtrace_lock);
9442 lck_mtx_unlock(&dtrace_meta_lock);
9443 cmn_err(CE_WARN, "failed to register meta-provider %s: "
9444 "user-land meta-provider exists", name);
9445 kmem_free(meta, sizeof (dtrace_meta_t));
9446 return (EINVAL);
9447 }
9448
9449 meta->dtm_name = dtrace_strref(name);
9450
9451 dtrace_meta_pid = meta;
9452 *idp = (dtrace_meta_provider_id_t)meta;
9453
9454 /*
9455 * If there are providers and probes ready to go, pass them
9456 * off to the new meta provider now.
9457 */
9458
9459 help = dtrace_deferred_pid;
9460 dtrace_deferred_pid = NULL;
9461
9462 lck_mtx_unlock(&dtrace_lock);
9463
9464 while (help != NULL) {
9465 for (i = 0; i < help->dthps_nprovs; i++) {
9466 proc_t *p = proc_find(help->dthps_pid);
9467 if (p == PROC_NULL)
9468 continue;
9469 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9470 p);
9471 proc_rele(p);
9472 }
9473
9474 next = help->dthps_next;
9475 help->dthps_next = NULL;
9476 help->dthps_prev = NULL;
9477 help->dthps_deferred = 0;
9478 help = next;
9479 }
9480
9481 lck_mtx_unlock(&dtrace_meta_lock);
9482
9483 return (0);
9484 }
9485
9486 int
9487 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9488 {
9489 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9490
9491 lck_mtx_lock(&dtrace_meta_lock);
9492 lck_mtx_lock(&dtrace_lock);
9493
9494 if (old == dtrace_meta_pid) {
9495 pp = &dtrace_meta_pid;
9496 } else {
9497 panic("attempt to unregister non-existent "
9498 "dtrace meta-provider %p\n", (void *)old);
9499 }
9500
9501 if (old->dtm_count != 0) {
9502 lck_mtx_unlock(&dtrace_lock);
9503 lck_mtx_unlock(&dtrace_meta_lock);
9504 return (EBUSY);
9505 }
9506
9507 *pp = NULL;
9508
9509 dtrace_strunref(old->dtm_name);
9510
9511 lck_mtx_unlock(&dtrace_lock);
9512 lck_mtx_unlock(&dtrace_meta_lock);
9513
9514 kmem_free(old, sizeof (dtrace_meta_t));
9515
9516 return (0);
9517 }
9518
9519
9520 /*
9521 * DTrace DIF Object Functions
9522 */
9523 static int
9524 dtrace_difo_err(uint_t pc, const char *format, ...)
9525 {
9526 if (dtrace_err_verbose) {
9527 va_list alist;
9528
9529 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
9530 va_start(alist, format);
9531 (void) vuprintf(format, alist);
9532 va_end(alist);
9533 }
9534
9535 #ifdef DTRACE_ERRDEBUG
9536 dtrace_errdebug(format);
9537 #endif
9538 return (1);
9539 }
9540
9541 /*
9542 * Validate a DTrace DIF object by checking the IR instructions. The following
9543 * rules are currently enforced by dtrace_difo_validate():
9544 *
9545 * 1. Each instruction must have a valid opcode
9546 * 2. Each register, string, variable, or subroutine reference must be valid
9547 * 3. No instruction can modify register %r0 (must be zero)
9548 * 4. All instruction reserved bits must be set to zero
9549 * 5. The last instruction must be a "ret" instruction
9550 * 6. All branch targets must reference a valid instruction _after_ the branch
9551 */
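/*
 * An illustrative DIFO body satisfying rules 1-6 (assuming the
 * DIF_INSTR_* constructor macros from <sys/dtrace.h>): the two-instruction
 * program
 *
 *	DIF_INSTR_SETX(0, 2)		setx integer-table[0], %r2
 *	DIF_INSTR_RET(2)		ret  %r2
 *
 * uses valid opcodes and registers (for nregs > 2 and an integer table of
 * at least one entry), never writes %r0, contains no branches, and ends
 * in "ret" -- so it validates cleanly and returns integer-table entry 0.
 */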
9552 static int
9553 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9554 cred_t *cr)
9555 {
9556 int err = 0;
9557 uint_t i;
9558
9559 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9560 int kcheckload;
9561 uint_t pc;
9562 int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
9563
9564 kcheckload = cr == NULL ||
9565 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9566
9567 dp->dtdo_destructive = 0;
9568
9569 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9570 dif_instr_t instr = dp->dtdo_buf[pc];
9571
9572 uint_t r1 = DIF_INSTR_R1(instr);
9573 uint_t r2 = DIF_INSTR_R2(instr);
9574 uint_t rd = DIF_INSTR_RD(instr);
9575 uint_t rs = DIF_INSTR_RS(instr);
9576 uint_t label = DIF_INSTR_LABEL(instr);
9577 uint_t v = DIF_INSTR_VAR(instr);
9578 uint_t subr = DIF_INSTR_SUBR(instr);
9579 uint_t type = DIF_INSTR_TYPE(instr);
9580 uint_t op = DIF_INSTR_OP(instr);
9581
9582 switch (op) {
9583 case DIF_OP_OR:
9584 case DIF_OP_XOR:
9585 case DIF_OP_AND:
9586 case DIF_OP_SLL:
9587 case DIF_OP_SRL:
9588 case DIF_OP_SRA:
9589 case DIF_OP_SUB:
9590 case DIF_OP_ADD:
9591 case DIF_OP_MUL:
9592 case DIF_OP_SDIV:
9593 case DIF_OP_UDIV:
9594 case DIF_OP_SREM:
9595 case DIF_OP_UREM:
9596 case DIF_OP_COPYS:
9597 if (r1 >= nregs)
9598 err += efunc(pc, "invalid register %u\n", r1);
9599 if (r2 >= nregs)
9600 err += efunc(pc, "invalid register %u\n", r2);
9601 if (rd >= nregs)
9602 err += efunc(pc, "invalid register %u\n", rd);
9603 if (rd == 0)
9604 err += efunc(pc, "cannot write to %%r0\n");
9605 break;
9606 case DIF_OP_NOT:
9607 case DIF_OP_MOV:
9608 case DIF_OP_ALLOCS:
9609 if (r1 >= nregs)
9610 err += efunc(pc, "invalid register %u\n", r1);
9611 if (r2 != 0)
9612 err += efunc(pc, "non-zero reserved bits\n");
9613 if (rd >= nregs)
9614 err += efunc(pc, "invalid register %u\n", rd);
9615 if (rd == 0)
9616 err += efunc(pc, "cannot write to %%r0\n");
9617 break;
9618 case DIF_OP_LDSB:
9619 case DIF_OP_LDSH:
9620 case DIF_OP_LDSW:
9621 case DIF_OP_LDUB:
9622 case DIF_OP_LDUH:
9623 case DIF_OP_LDUW:
9624 case DIF_OP_LDX:
9625 if (r1 >= nregs)
9626 err += efunc(pc, "invalid register %u\n", r1);
9627 if (r2 != 0)
9628 err += efunc(pc, "non-zero reserved bits\n");
9629 if (rd >= nregs)
9630 err += efunc(pc, "invalid register %u\n", rd);
9631 if (rd == 0)
9632 err += efunc(pc, "cannot write to %%r0\n");
9633 if (kcheckload)
9634 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9635 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9636 break;
9637 case DIF_OP_RLDSB:
9638 case DIF_OP_RLDSH:
9639 case DIF_OP_RLDSW:
9640 case DIF_OP_RLDUB:
9641 case DIF_OP_RLDUH:
9642 case DIF_OP_RLDUW:
9643 case DIF_OP_RLDX:
9644 if (r1 >= nregs)
9645 err += efunc(pc, "invalid register %u\n", r1);
9646 if (r2 != 0)
9647 err += efunc(pc, "non-zero reserved bits\n");
9648 if (rd >= nregs)
9649 err += efunc(pc, "invalid register %u\n", rd);
9650 if (rd == 0)
9651 err += efunc(pc, "cannot write to %%r0\n");
9652 break;
9653 case DIF_OP_ULDSB:
9654 case DIF_OP_ULDSH:
9655 case DIF_OP_ULDSW:
9656 case DIF_OP_ULDUB:
9657 case DIF_OP_ULDUH:
9658 case DIF_OP_ULDUW:
9659 case DIF_OP_ULDX:
9660 if (r1 >= nregs)
9661 err += efunc(pc, "invalid register %u\n", r1);
9662 if (r2 != 0)
9663 err += efunc(pc, "non-zero reserved bits\n");
9664 if (rd >= nregs)
9665 err += efunc(pc, "invalid register %u\n", rd);
9666 if (rd == 0)
9667 err += efunc(pc, "cannot write to %%r0\n");
9668 break;
9669 case DIF_OP_STB:
9670 case DIF_OP_STH:
9671 case DIF_OP_STW:
9672 case DIF_OP_STX:
9673 if (r1 >= nregs)
9674 err += efunc(pc, "invalid register %u\n", r1);
9675 if (r2 != 0)
9676 err += efunc(pc, "non-zero reserved bits\n");
9677 if (rd >= nregs)
9678 err += efunc(pc, "invalid register %u\n", rd);
9679 if (rd == 0)
9680 err += efunc(pc, "cannot write to 0 address\n");
9681 break;
9682 case DIF_OP_CMP:
9683 case DIF_OP_SCMP:
9684 if (r1 >= nregs)
9685 err += efunc(pc, "invalid register %u\n", r1);
9686 if (r2 >= nregs)
9687 err += efunc(pc, "invalid register %u\n", r2);
9688 if (rd != 0)
9689 err += efunc(pc, "non-zero reserved bits\n");
9690 break;
9691 case DIF_OP_TST:
9692 if (r1 >= nregs)
9693 err += efunc(pc, "invalid register %u\n", r1);
9694 if (r2 != 0 || rd != 0)
9695 err += efunc(pc, "non-zero reserved bits\n");
9696 break;
9697 case DIF_OP_BA:
9698 case DIF_OP_BE:
9699 case DIF_OP_BNE:
9700 case DIF_OP_BG:
9701 case DIF_OP_BGU:
9702 case DIF_OP_BGE:
9703 case DIF_OP_BGEU:
9704 case DIF_OP_BL:
9705 case DIF_OP_BLU:
9706 case DIF_OP_BLE:
9707 case DIF_OP_BLEU:
9708 if (label >= dp->dtdo_len) {
9709 err += efunc(pc, "invalid branch target %u\n",
9710 label);
9711 }
9712 if (label <= pc) {
9713 err += efunc(pc, "backward branch to %u\n",
9714 label);
9715 }
9716 break;
9717 case DIF_OP_RET:
9718 if (r1 != 0 || r2 != 0)
9719 err += efunc(pc, "non-zero reserved bits\n");
9720 if (rd >= nregs)
9721 err += efunc(pc, "invalid register %u\n", rd);
9722 break;
9723 case DIF_OP_NOP:
9724 case DIF_OP_POPTS:
9725 case DIF_OP_FLUSHTS:
9726 if (r1 != 0 || r2 != 0 || rd != 0)
9727 err += efunc(pc, "non-zero reserved bits\n");
9728 break;
9729 case DIF_OP_SETX:
9730 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9731 err += efunc(pc, "invalid integer ref %u\n",
9732 DIF_INSTR_INTEGER(instr));
9733 }
9734 if (rd >= nregs)
9735 err += efunc(pc, "invalid register %u\n", rd);
9736 if (rd == 0)
9737 err += efunc(pc, "cannot write to %%r0\n");
9738 break;
9739 case DIF_OP_SETS:
9740 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9741 err += efunc(pc, "invalid string ref %u\n",
9742 DIF_INSTR_STRING(instr));
9743 }
9744 if (rd >= nregs)
9745 err += efunc(pc, "invalid register %u\n", rd);
9746 if (rd == 0)
9747 err += efunc(pc, "cannot write to %%r0\n");
9748 break;
9749 case DIF_OP_LDGA:
9750 case DIF_OP_LDTA:
9751 if (r1 > DIF_VAR_ARRAY_MAX)
9752 err += efunc(pc, "invalid array %u\n", r1);
9753 if (r2 >= nregs)
9754 err += efunc(pc, "invalid register %u\n", r2);
9755 if (rd >= nregs)
9756 err += efunc(pc, "invalid register %u\n", rd);
9757 if (rd == 0)
9758 err += efunc(pc, "cannot write to %%r0\n");
9759 break;
9760 case DIF_OP_LDGS:
9761 case DIF_OP_LDTS:
9762 case DIF_OP_LDLS:
9763 case DIF_OP_LDGAA:
9764 case DIF_OP_LDTAA:
9765 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9766 err += efunc(pc, "invalid variable %u\n", v);
9767 if (rd >= nregs)
9768 err += efunc(pc, "invalid register %u\n", rd);
9769 if (rd == 0)
9770 err += efunc(pc, "cannot write to %%r0\n");
9771 break;
9772 case DIF_OP_STGS:
9773 case DIF_OP_STTS:
9774 case DIF_OP_STLS:
9775 case DIF_OP_STGAA:
9776 case DIF_OP_STTAA:
9777 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9778 err += efunc(pc, "invalid variable %u\n", v);
9779 if (rs >= nregs)
9780 err += efunc(pc, "invalid register %u\n", rs);
9781 break;
9782 case DIF_OP_CALL:
9783 if (subr > DIF_SUBR_MAX &&
9784 !(subr >= DIF_SUBR_APPLE_MIN && subr <= DIF_SUBR_APPLE_MAX))
9785 err += efunc(pc, "invalid subr %u\n", subr);
9786 if (rd >= nregs)
9787 err += efunc(pc, "invalid register %u\n", rd);
9788 if (rd == 0)
9789 err += efunc(pc, "cannot write to %%r0\n");
9790
9791 if (subr == DIF_SUBR_COPYOUT ||
9792 subr == DIF_SUBR_COPYOUTSTR ||
9793 subr == DIF_SUBR_KDEBUG_TRACE ||
9794 subr == DIF_SUBR_KDEBUG_TRACE_STRING) {
9795 dp->dtdo_destructive = 1;
9796 }
9797 break;
9798 case DIF_OP_PUSHTR:
9799 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9800 err += efunc(pc, "invalid ref type %u\n", type);
9801 if (r2 >= nregs)
9802 err += efunc(pc, "invalid register %u\n", r2);
9803 if (rs >= nregs)
9804 err += efunc(pc, "invalid register %u\n", rs);
9805 break;
9806 case DIF_OP_PUSHTV:
9807 if (type != DIF_TYPE_CTF)
9808 err += efunc(pc, "invalid val type %u\n", type);
9809 if (r2 >= nregs)
9810 err += efunc(pc, "invalid register %u\n", r2);
9811 if (rs >= nregs)
9812 err += efunc(pc, "invalid register %u\n", rs);
9813 break;
9814 case DIF_OP_STRIP:
9815 if (r1 >= nregs)
9816 err += efunc(pc, "invalid register %u\n", r1);
9817 if (!dtrace_is_valid_ptrauth_key(r2))
9818 err += efunc(pc, "invalid key\n");
9819 if (rd >= nregs)
9820 err += efunc(pc, "invalid register %u\n", rd);
9821 if (rd == 0)
9822 err += efunc(pc, "cannot write to %%r0\n");
9823 break;
9824 default:
9825 err += efunc(pc, "invalid opcode %u\n",
9826 DIF_INSTR_OP(instr));
9827 }
9828 }
9829
9830 if (dp->dtdo_len != 0 &&
9831 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9832 err += efunc(dp->dtdo_len - 1,
9833 "expected 'ret' as last DIF instruction\n");
9834 }
9835
9836 if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9837 /*
9838 * If we're not returning by reference, the size must be either
9839 * 0 or the size of one of the base types.
9840 */
9841 switch (dp->dtdo_rtype.dtdt_size) {
9842 case 0:
9843 case sizeof (uint8_t):
9844 case sizeof (uint16_t):
9845 case sizeof (uint32_t):
9846 case sizeof (uint64_t):
9847 break;
9848
9849 default:
9850 err += efunc(dp->dtdo_len - 1, "bad return size\n");
9851 }
9852 }
9853
9854 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9855 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9856 dtrace_diftype_t *vt, *et;
9857 uint_t id;
9858 int ndx;
9859
9860 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9861 v->dtdv_scope != DIFV_SCOPE_THREAD &&
9862 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9863 err += efunc(i, "unrecognized variable scope %d\n",
9864 v->dtdv_scope);
9865 break;
9866 }
9867
9868 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9869 v->dtdv_kind != DIFV_KIND_SCALAR) {
9870 err += efunc(i, "unrecognized variable type %d\n",
9871 v->dtdv_kind);
9872 break;
9873 }
9874
9875 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9876 err += efunc(i, "%d exceeds variable id limit\n", id);
9877 break;
9878 }
9879
9880 if (id < DIF_VAR_OTHER_UBASE)
9881 continue;
9882
9883 /*
9884 * For user-defined variables, we need to check that this
9885 * definition is identical to any previous definition that we
9886 * encountered.
9887 */
9888 ndx = id - DIF_VAR_OTHER_UBASE;
9889
9890 switch (v->dtdv_scope) {
9891 case DIFV_SCOPE_GLOBAL:
9892 if (maxglobal == -1 || ndx > maxglobal)
9893 maxglobal = ndx;
9894
9895 if (ndx < vstate->dtvs_nglobals) {
9896 dtrace_statvar_t *svar;
9897
9898 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9899 existing = &svar->dtsv_var;
9900 }
9901
9902 break;
9903
9904 case DIFV_SCOPE_THREAD:
9905 if (maxtlocal == -1 || ndx > maxtlocal)
9906 maxtlocal = ndx;
9907
9908 if (ndx < vstate->dtvs_ntlocals)
9909 existing = &vstate->dtvs_tlocals[ndx];
9910 break;
9911
9912 case DIFV_SCOPE_LOCAL:
9913 if (maxlocal == -1 || ndx > maxlocal)
9914 maxlocal = ndx;
9915 if (ndx < vstate->dtvs_nlocals) {
9916 dtrace_statvar_t *svar;
9917
9918 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9919 existing = &svar->dtsv_var;
9920 }
9921
9922 break;
9923 }
9924
9925 vt = &v->dtdv_type;
9926
9927 if (vt->dtdt_flags & DIF_TF_BYREF) {
9928 if (vt->dtdt_size == 0) {
9929 err += efunc(i, "zero-sized variable\n");
9930 break;
9931 }
9932
9933 if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
9934 v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
9935 vt->dtdt_size > dtrace_statvar_maxsize) {
9936 err += efunc(i, "oversized by-ref static\n");
9937 break;
9938 }
9939 }
9940
9941 if (existing == NULL || existing->dtdv_id == 0)
9942 continue;
9943
9944 ASSERT(existing->dtdv_id == v->dtdv_id);
9945 ASSERT(existing->dtdv_scope == v->dtdv_scope);
9946
9947 if (existing->dtdv_kind != v->dtdv_kind)
9948 err += efunc(i, "%d changed variable kind\n", id);
9949
9950 et = &existing->dtdv_type;
9951
9952 if (vt->dtdt_flags != et->dtdt_flags) {
9953 err += efunc(i, "%d changed variable type flags\n", id);
9954 break;
9955 }
9956
9957 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9958 err += efunc(i, "%d changed variable type size\n", id);
9959 break;
9960 }
9961 }
9962
9963 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9964 dif_instr_t instr = dp->dtdo_buf[pc];
9965
9966 uint_t v = DIF_INSTR_VAR(instr);
9967 uint_t op = DIF_INSTR_OP(instr);
9968
9969 switch (op) {
9970 case DIF_OP_LDGS:
9971 case DIF_OP_LDGAA:
9972 case DIF_OP_STGS:
9973 case DIF_OP_STGAA:
9974 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxglobal))
9975 err += efunc(pc, "invalid variable %u\n", v);
9976 break;
9977 case DIF_OP_LDTS:
9978 case DIF_OP_LDTAA:
9979 case DIF_OP_STTS:
9980 case DIF_OP_STTAA:
9981 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxtlocal))
9982 err += efunc(pc, "invalid variable %u\n", v);
9983 break;
9984 case DIF_OP_LDLS:
9985 case DIF_OP_STLS:
9986 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxlocal))
9987 err += efunc(pc, "invalid variable %u\n", v);
9988 break;
9989 default:
9990 break;
9991 }
9992 }
9993
9994 return (err);
9995 }
9996
9997 /*
9998 * Validate a DTrace DIF object that is to be used as a helper. Helpers
9999 * are much more constrained than normal DIFOs. Specifically, they may
10000 * not:
10001 *
10002 * 1. Make calls to subroutines other than copyin(), copyinstr(), or
10003 * miscellaneous string routines.
10004 * 2. Access DTrace variables other than the args[] array, and the
10005 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
10006 * 3. Have thread-local variables.
10007 * 4. Have dynamic variables.
10008 */
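/*
 * An illustrative sketch (hypothetical DIF, using the mnemonics of the
 * opcodes handled below): a helper body along the lines of
 *
 *	call	copyinstr, %r2		! allowed subroutine
 *	ldgs	DIF_VAR_EXECNAME, %r3	! allowed variable
 *	ret	%r3
 *
 * passes this validator, whereas substituting a call to copyout() would
 * fail with "invalid subr", and a thread-local load (ldts) would fail
 * with "illegal dynamic variable load".
 */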
10009 static int
10010 dtrace_difo_validate_helper(dtrace_difo_t *dp)
10011 {
10012 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
10013 int err = 0;
10014 uint_t pc;
10015
10016 for (pc = 0; pc < dp->dtdo_len; pc++) {
10017 dif_instr_t instr = dp->dtdo_buf[pc];
10018
10019 uint_t v = DIF_INSTR_VAR(instr);
10020 uint_t subr = DIF_INSTR_SUBR(instr);
10021 uint_t op = DIF_INSTR_OP(instr);
10022
10023 switch (op) {
10024 case DIF_OP_OR:
10025 case DIF_OP_XOR:
10026 case DIF_OP_AND:
10027 case DIF_OP_SLL:
10028 case DIF_OP_SRL:
10029 case DIF_OP_SRA:
10030 case DIF_OP_SUB:
10031 case DIF_OP_ADD:
10032 case DIF_OP_MUL:
10033 case DIF_OP_SDIV:
10034 case DIF_OP_UDIV:
10035 case DIF_OP_SREM:
10036 case DIF_OP_UREM:
10037 case DIF_OP_COPYS:
10038 case DIF_OP_NOT:
10039 case DIF_OP_MOV:
10040 case DIF_OP_RLDSB:
10041 case DIF_OP_RLDSH:
10042 case DIF_OP_RLDSW:
10043 case DIF_OP_RLDUB:
10044 case DIF_OP_RLDUH:
10045 case DIF_OP_RLDUW:
10046 case DIF_OP_RLDX:
10047 case DIF_OP_ULDSB:
10048 case DIF_OP_ULDSH:
10049 case DIF_OP_ULDSW:
10050 case DIF_OP_ULDUB:
10051 case DIF_OP_ULDUH:
10052 case DIF_OP_ULDUW:
10053 case DIF_OP_ULDX:
10054 case DIF_OP_STB:
10055 case DIF_OP_STH:
10056 case DIF_OP_STW:
10057 case DIF_OP_STX:
10058 case DIF_OP_ALLOCS:
10059 case DIF_OP_CMP:
10060 case DIF_OP_SCMP:
10061 case DIF_OP_TST:
10062 case DIF_OP_BA:
10063 case DIF_OP_BE:
10064 case DIF_OP_BNE:
10065 case DIF_OP_BG:
10066 case DIF_OP_BGU:
10067 case DIF_OP_BGE:
10068 case DIF_OP_BGEU:
10069 case DIF_OP_BL:
10070 case DIF_OP_BLU:
10071 case DIF_OP_BLE:
10072 case DIF_OP_BLEU:
10073 case DIF_OP_RET:
10074 case DIF_OP_NOP:
10075 case DIF_OP_POPTS:
10076 case DIF_OP_FLUSHTS:
10077 case DIF_OP_SETX:
10078 case DIF_OP_SETS:
10079 case DIF_OP_LDGA:
10080 case DIF_OP_LDLS:
10081 case DIF_OP_STGS:
10082 case DIF_OP_STLS:
10083 case DIF_OP_PUSHTR:
10084 case DIF_OP_PUSHTV:
10085 break;
10086
10087 case DIF_OP_LDGS:
10088 if (v >= DIF_VAR_OTHER_UBASE)
10089 break;
10090
10091 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10092 break;
10093
10094 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
10095 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
10096 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
10097 v == DIF_VAR_UID || v == DIF_VAR_GID)
10098 break;
10099
10100 err += efunc(pc, "illegal variable %u\n", v);
10101 break;
10102
10103 case DIF_OP_LDTA:
10104 case DIF_OP_LDTS:
10105 case DIF_OP_LDGAA:
10106 case DIF_OP_LDTAA:
10107 err += efunc(pc, "illegal dynamic variable load\n");
10108 break;
10109
10110 case DIF_OP_STTS:
10111 case DIF_OP_STGAA:
10112 case DIF_OP_STTAA:
10113 err += efunc(pc, "illegal dynamic variable store\n");
10114 break;
10115
10116 case DIF_OP_CALL:
10117 if (subr == DIF_SUBR_ALLOCA ||
10118 subr == DIF_SUBR_BCOPY ||
10119 subr == DIF_SUBR_COPYIN ||
10120 subr == DIF_SUBR_COPYINTO ||
10121 subr == DIF_SUBR_COPYINSTR ||
10122 subr == DIF_SUBR_INDEX ||
10123 subr == DIF_SUBR_INET_NTOA ||
10124 subr == DIF_SUBR_INET_NTOA6 ||
10125 subr == DIF_SUBR_INET_NTOP ||
10126 subr == DIF_SUBR_JSON ||
10127 subr == DIF_SUBR_LLTOSTR ||
10128 subr == DIF_SUBR_STRTOLL ||
10129 subr == DIF_SUBR_RINDEX ||
10130 subr == DIF_SUBR_STRCHR ||
10131 subr == DIF_SUBR_STRJOIN ||
10132 subr == DIF_SUBR_STRRCHR ||
10133 subr == DIF_SUBR_STRSTR ||
10134 subr == DIF_SUBR_KDEBUG_TRACE ||
10135 subr == DIF_SUBR_KDEBUG_TRACE_STRING ||
10136 subr == DIF_SUBR_HTONS ||
10137 subr == DIF_SUBR_HTONL ||
10138 subr == DIF_SUBR_HTONLL ||
10139 subr == DIF_SUBR_NTOHS ||
10140 subr == DIF_SUBR_NTOHL ||
10141 subr == DIF_SUBR_NTOHLL)
10142 break;
10143
10144 err += efunc(pc, "invalid subr %u\n", subr);
10145 break;
10146
10147 default:
10148 err += efunc(pc, "invalid opcode %u\n",
10149 DIF_INSTR_OP(instr));
10150 }
10151 }
10152
10153 return (err);
10154 }
10155
10156 /*
10157 * Returns 1 if the expression in the DIF object can be cached on a per-thread
10158 * basis; 0 if not.
10159 */
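/*
 * For example, a predicate such as /execname == "foo"/ depends only on
 * variables in the list below and performs no memory loads, so its
 * result may be cached per-thread; any predicate referencing, say,
 * timestamp -- or loading memory -- must be re-evaluated on every
 * probe firing.
 */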
10160 static int
10161 dtrace_difo_cacheable(dtrace_difo_t *dp)
10162 {
10163 uint_t i;
10164
10165 if (dp == NULL)
10166 return (0);
10167
10168 for (i = 0; i < dp->dtdo_varlen; i++) {
10169 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10170
10171 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10172 continue;
10173
10174 switch (v->dtdv_id) {
10175 case DIF_VAR_CURTHREAD:
10176 case DIF_VAR_PID:
10177 case DIF_VAR_TID:
10178 case DIF_VAR_EXECNAME:
10179 case DIF_VAR_ZONENAME:
10180 break;
10181
10182 default:
10183 return (0);
10184 }
10185 }
10186
10187 /*
10188 * This DIF object may be cacheable. Now we need to look for any
10189 * array loading instructions, any memory loading instructions, or
10190 * any stores to thread-local variables.
10191 */
10192 for (i = 0; i < dp->dtdo_len; i++) {
10193 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10194
10195 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10196 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10197 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10198 op == DIF_OP_LDGA || op == DIF_OP_STTS)
10199 return (0);
10200 }
10201
10202 return (1);
10203 }
10204
10205 static void
10206 dtrace_difo_hold(dtrace_difo_t *dp)
10207 {
10208 uint_t i;
10209
10210 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10211
10212 dp->dtdo_refcnt++;
10213 ASSERT(dp->dtdo_refcnt != 0);
10214
10215 /*
10216 * We need to check this DIF object for references to the variable
10217 * DIF_VAR_VTIMESTAMP.
10218 */
10219 for (i = 0; i < dp->dtdo_varlen; i++) {
10220 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10221
10222 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10223 continue;
10224
10225 if (dtrace_vtime_references++ == 0)
10226 dtrace_vtime_enable();
10227 }
10228 }
10229
10230 /*
10231 * This routine calculates the dynamic variable chunksize for a given DIF
10232 * object. The calculation is not fool-proof, and can probably be tricked by
10233 * malicious DIF -- but it works for all compiler-generated DIF. Because this
10234 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10235 * if a dynamic variable size exceeds the chunksize.
10236 */
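/*
 * A worked example (hypothetical sizes): for an enabling along the lines
 * of self->x[copyinstr(arg0)] = 1 with a 32-byte string key, the PUSHTR
 * contributes one 32-byte key and the STTAA adds two zero-sized keys
 * (the thread and the variable id), so the loop below computes roughly
 *
 *	sizeof (dtrace_dynvar_t)	variable header
 *	+ 2 * sizeof (dtrace_key_t)	additional key descriptors
 *	+ P2ROUNDUP(32, 8)		key data
 *	+ sizeof (uint64_t)		the stored value
 *
 * rounded up to an 8-byte boundary.
 */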
10237 static void
10238 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10239 {
10240 uint64_t sval = 0;
10241 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10242 const dif_instr_t *text = dp->dtdo_buf;
10243 uint_t pc, srd = 0;
10244 uint_t ttop = 0;
10245 size_t size, ksize;
10246 uint_t id, i;
10247
10248 for (pc = 0; pc < dp->dtdo_len; pc++) {
10249 dif_instr_t instr = text[pc];
10250 uint_t op = DIF_INSTR_OP(instr);
10251 uint_t rd = DIF_INSTR_RD(instr);
10252 uint_t r1 = DIF_INSTR_R1(instr);
10253 uint_t nkeys = 0;
10254 uchar_t scope;
10255
10256 dtrace_key_t *key = tupregs;
10257
10258 switch (op) {
10259 case DIF_OP_SETX:
10260 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10261 srd = rd;
10262 continue;
10263
10264 case DIF_OP_STTS:
10265 key = &tupregs[DIF_DTR_NREGS];
10266 key[0].dttk_size = 0;
10267 key[1].dttk_size = 0;
10268 nkeys = 2;
10269 scope = DIFV_SCOPE_THREAD;
10270 break;
10271
10272 case DIF_OP_STGAA:
10273 case DIF_OP_STTAA:
10274 nkeys = ttop;
10275
10276 if (op == DIF_OP_STTAA)
10277 key[nkeys++].dttk_size = 0;
10278
10279 key[nkeys++].dttk_size = 0;
10280
10281 if (op == DIF_OP_STTAA) {
10282 scope = DIFV_SCOPE_THREAD;
10283 } else {
10284 scope = DIFV_SCOPE_GLOBAL;
10285 }
10286
10287 break;
10288
10289 case DIF_OP_PUSHTR:
10290 if (ttop == DIF_DTR_NREGS)
10291 return;
10292
10293 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10294 /*
10295 * If the register for the size of the "pushtr"
10296 * is %r0 (or the value is 0) and the type is
10297 * a string, we'll use the system-wide default
10298 * string size.
10299 */
10300 tupregs[ttop++].dttk_size =
10301 dtrace_strsize_default;
10302 } else {
10303 if (srd == 0)
10304 return;
10305
10306 if (sval > LONG_MAX)
10307 return;
10308
10309 tupregs[ttop++].dttk_size = sval;
10310 }
10311
10312 break;
10313
10314 case DIF_OP_PUSHTV:
10315 if (ttop == DIF_DTR_NREGS)
10316 return;
10317
10318 tupregs[ttop++].dttk_size = 0;
10319 break;
10320
10321 case DIF_OP_FLUSHTS:
10322 ttop = 0;
10323 break;
10324
10325 case DIF_OP_POPTS:
10326 if (ttop != 0)
10327 ttop--;
10328 break;
10329 }
10330
10331 sval = 0;
10332 srd = 0;
10333
10334 if (nkeys == 0)
10335 continue;
10336
10337 /*
10338 * We have a dynamic variable allocation; calculate its size.
10339 */
10340 for (ksize = 0, i = 0; i < nkeys; i++)
10341 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10342
10343 size = sizeof (dtrace_dynvar_t);
10344 size += sizeof (dtrace_key_t) * (nkeys - 1);
10345 size += ksize;
10346
10347 /*
10348 * Now we need to determine the size of the stored data.
10349 */
10350 id = DIF_INSTR_VAR(instr);
10351
10352 for (i = 0; i < dp->dtdo_varlen; i++) {
10353 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10354
10355 if (v->dtdv_id == id && v->dtdv_scope == scope) {
10356 size += v->dtdv_type.dtdt_size;
10357 break;
10358 }
10359 }
10360
10361 if (i == dp->dtdo_varlen)
10362 return;
10363
10364 /*
10365 * We have the size. If this is larger than the chunk size
10366 * for our dynamic variable state, reset the chunk size.
10367 */
10368 size = P2ROUNDUP(size, sizeof (uint64_t));
10369
10370 /*
10371 * Before setting the chunk size, check that we're not going
10372 * to set it to a negative value...
10373 */
10374 if (size > LONG_MAX)
10375 return;
10376
10377 /*
10378 * ...and make certain that we didn't badly overflow.
10379 */
10380 if (size < ksize || size < sizeof (dtrace_dynvar_t))
10381 return;
10382
10383 if (size > vstate->dtvs_dynvars.dtds_chunksize)
10384 vstate->dtvs_dynvars.dtds_chunksize = size;
10385 }
10386 }
10387
10388 static void
10389 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10390 {
10391 int oldsvars, osz, nsz, otlocals, ntlocals;
10392 uint_t i, id;
10393
10394 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10395 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10396
10397 for (i = 0; i < dp->dtdo_varlen; i++) {
10398 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10399 dtrace_statvar_t *svar;
10400 dtrace_statvar_t ***svarp = NULL;
10401 size_t dsize = 0;
10402 uint8_t scope = v->dtdv_scope;
10403 int *np = NULL;
10404
10405 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10406 continue;
10407
10408 id -= DIF_VAR_OTHER_UBASE;
10409
10410 switch (scope) {
10411 case DIFV_SCOPE_THREAD:
10412 while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
10413 dtrace_difv_t *tlocals;
10414
10415 if ((ntlocals = (otlocals << 1)) == 0)
10416 ntlocals = 1;
10417
10418 osz = otlocals * sizeof (dtrace_difv_t);
10419 nsz = ntlocals * sizeof (dtrace_difv_t);
10420
10421 tlocals = kmem_zalloc(nsz, KM_SLEEP);
10422
10423 if (osz != 0) {
10424 bcopy(vstate->dtvs_tlocals,
10425 tlocals, osz);
10426 kmem_free(vstate->dtvs_tlocals, osz);
10427 }
10428
10429 vstate->dtvs_tlocals = tlocals;
10430 vstate->dtvs_ntlocals = ntlocals;
10431 }
10432
10433 vstate->dtvs_tlocals[id] = *v;
10434 continue;
10435
10436 case DIFV_SCOPE_LOCAL:
10437 np = &vstate->dtvs_nlocals;
10438 svarp = &vstate->dtvs_locals;
10439
10440 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10441 dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
10442 sizeof (uint64_t));
10443 else
10444 dsize = (int)NCPU * sizeof (uint64_t);
10445
10446 break;
10447
10448 case DIFV_SCOPE_GLOBAL:
10449 np = &vstate->dtvs_nglobals;
10450 svarp = &vstate->dtvs_globals;
10451
10452 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10453 dsize = v->dtdv_type.dtdt_size +
10454 sizeof (uint64_t);
10455
10456 break;
10457
10458 default:
10459 ASSERT(0);
10460 }
10461
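/*
 * Grow the per-scope statics array geometrically until it covers this
 * variable's index: e.g. (hypothetically) a variable with id 5 arriving
 * when only 4 slots exist doubles the array to 8 entries, copying the
 * old pointers and freeing the old array.
 */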
10462 while (id >= (uint_t)(oldsvars = *np)) {
10463 dtrace_statvar_t **statics;
10464 int newsvars, oldsize, newsize;
10465
10466 if ((newsvars = (oldsvars << 1)) == 0)
10467 newsvars = 1;
10468
10469 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10470 newsize = newsvars * sizeof (dtrace_statvar_t *);
10471
10472 statics = kmem_zalloc(newsize, KM_SLEEP);
10473
10474 if (oldsize != 0) {
10475 bcopy(*svarp, statics, oldsize);
10476 kmem_free(*svarp, oldsize);
10477 }
10478
10479 *svarp = statics;
10480 *np = newsvars;
10481 }
10482
10483 if ((svar = (*svarp)[id]) == NULL) {
10484 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10485 svar->dtsv_var = *v;
10486
10487 if ((svar->dtsv_size = dsize) != 0) {
10488 svar->dtsv_data = (uint64_t)(uintptr_t)
10489 kmem_zalloc(dsize, KM_SLEEP);
10490 }
10491
10492 (*svarp)[id] = svar;
10493 }
10494
10495 svar->dtsv_refcnt++;
10496 }
10497
10498 dtrace_difo_chunksize(dp, vstate);
10499 dtrace_difo_hold(dp);
10500 }
10501
10502 static dtrace_difo_t *
10503 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10504 {
10505 dtrace_difo_t *new;
10506 size_t sz;
10507
10508 ASSERT(dp->dtdo_buf != NULL);
10509 ASSERT(dp->dtdo_refcnt != 0);
10510
10511 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10512
10513 ASSERT(dp->dtdo_buf != NULL);
10514 sz = dp->dtdo_len * sizeof (dif_instr_t);
10515 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10516 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10517 new->dtdo_len = dp->dtdo_len;
10518
10519 if (dp->dtdo_strtab != NULL) {
10520 ASSERT(dp->dtdo_strlen != 0);
10521 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10522 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10523 new->dtdo_strlen = dp->dtdo_strlen;
10524 }
10525
10526 if (dp->dtdo_inttab != NULL) {
10527 ASSERT(dp->dtdo_intlen != 0);
10528 sz = dp->dtdo_intlen * sizeof (uint64_t);
10529 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10530 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10531 new->dtdo_intlen = dp->dtdo_intlen;
10532 }
10533
10534 if (dp->dtdo_vartab != NULL) {
10535 ASSERT(dp->dtdo_varlen != 0);
10536 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10537 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10538 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10539 new->dtdo_varlen = dp->dtdo_varlen;
10540 }
10541
10542 dtrace_difo_init(new, vstate);
10543 return (new);
10544 }
10545
10546 static void
10547 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10548 {
10549 uint_t i;
10550
10551 ASSERT(dp->dtdo_refcnt == 0);
10552
10553 for (i = 0; i < dp->dtdo_varlen; i++) {
10554 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10555 dtrace_statvar_t *svar;
10556 dtrace_statvar_t **svarp = NULL;
10557 uint_t id;
10558 uint8_t scope = v->dtdv_scope;
10559 int *np = NULL;
10560
10561 switch (scope) {
10562 case DIFV_SCOPE_THREAD:
10563 continue;
10564
10565 case DIFV_SCOPE_LOCAL:
10566 np = &vstate->dtvs_nlocals;
10567 svarp = vstate->dtvs_locals;
10568 break;
10569
10570 case DIFV_SCOPE_GLOBAL:
10571 np = &vstate->dtvs_nglobals;
10572 svarp = vstate->dtvs_globals;
10573 break;
10574
10575 default:
10576 ASSERT(0);
10577 }
10578
10579 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10580 continue;
10581
10582 id -= DIF_VAR_OTHER_UBASE;
10583
10584 ASSERT(id < (uint_t)*np);
10585
10586 svar = svarp[id];
10587 ASSERT(svar != NULL);
10588 ASSERT(svar->dtsv_refcnt > 0);
10589
10590 if (--svar->dtsv_refcnt > 0)
10591 continue;
10592
10593 if (svar->dtsv_size != 0) {
10594 ASSERT(svar->dtsv_data != 0);
10595 kmem_free((void *)(uintptr_t)svar->dtsv_data,
10596 svar->dtsv_size);
10597 }
10598
10599 kmem_free(svar, sizeof (dtrace_statvar_t));
10600 svarp[id] = NULL;
10601 }
10602
10603 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10604 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10605 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10606 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10607
10608 kmem_free(dp, sizeof (dtrace_difo_t));
10609 }
10610
10611 static void
10612 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10613 {
10614 uint_t i;
10615
10616 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10617 ASSERT(dp->dtdo_refcnt != 0);
10618
10619 for (i = 0; i < dp->dtdo_varlen; i++) {
10620 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10621
10622 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10623 continue;
10624
10625 ASSERT(dtrace_vtime_references > 0);
10626 if (--dtrace_vtime_references == 0)
10627 dtrace_vtime_disable();
10628 }
10629
10630 if (--dp->dtdo_refcnt == 0)
10631 dtrace_difo_destroy(dp, vstate);
10632 }
10633
10634 /*
10635 * DTrace Format Functions
10636 */
10637
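/*
 * Format strings (e.g. printf() arguments) are interned per-state in
 * dts_formats and reference-counted. Note the one-based convention for
 * the returned index -- a minimal sketch of a caller:
 *
 *	uint16_t fmt = dtrace_format_add(state, "%s %d\n");
 *	if (fmt == 0)
 *		...too many formats; fail silently...
 *	else
 *		...the record refers to state->dts_formats[fmt - 1]...
 */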
10638 static dtrace_format_t*
10639 dtrace_format_new(char *str)
10640 {
10641 dtrace_format_t *fmt = NULL;
10642 size_t bufsize = strlen(str) + 1;
10643
10644 fmt = kmem_zalloc(sizeof(*fmt) + bufsize, KM_SLEEP);
10645
10646 fmt->dtf_refcount = 1;
10647 (void) strlcpy(fmt->dtf_str, str, bufsize);
10648
10649 return fmt;
10650 }
10651
10652 static uint16_t
10653 dtrace_format_add(dtrace_state_t *state, char *str)
10654 {
10655 dtrace_format_t **new;
10656 uint16_t ndx;
10657
10658 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10659 if (state->dts_formats[ndx] == NULL) {
10660 state->dts_formats[ndx] = dtrace_format_new(str);
10661 return (ndx + 1);
10662 }
10663 else if (strcmp(state->dts_formats[ndx]->dtf_str, str) == 0) {
10664 VERIFY(state->dts_formats[ndx]->dtf_refcount < UINT64_MAX);
10665 state->dts_formats[ndx]->dtf_refcount++;
10666 return (ndx + 1);
10667 }
10668 }
10669
10670 if (state->dts_nformats == USHRT_MAX) {
10671 /*
10672 * This is only likely if a denial-of-service attack is being
10673 * attempted. As such, it's okay to fail silently here.
10674 */
10675 return (0);
10676 }
10677
10678 /*
10679 * For simplicity, we always resize the formats array to be exactly the
10680 * number of formats.
10681 */
10682 ndx = state->dts_nformats++;
10683 new = kmem_alloc((ndx + 1) * sizeof (*state->dts_formats), KM_SLEEP);
10684
10685 if (state->dts_formats != NULL) {
10686 ASSERT(ndx != 0);
10687 bcopy(state->dts_formats, new, ndx * sizeof (*state->dts_formats));
10688 kmem_free(state->dts_formats, ndx * sizeof (*state->dts_formats));
10689 }
10690
10691 state->dts_formats = new;
10692 state->dts_formats[ndx] = dtrace_format_new(str);
10693
10694 return (ndx + 1);
10695 }
10696
10697 static void
10698 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10699 {
10700 dtrace_format_t *fmt;
10701
10702 ASSERT(state->dts_formats != NULL);
10703 ASSERT(format <= state->dts_nformats);
10704
10705 fmt = state->dts_formats[format - 1];
10706
10707 ASSERT(fmt != NULL);
10708 VERIFY(fmt->dtf_refcount > 0);
10709
10710 fmt->dtf_refcount--;
10711
10712 if (fmt->dtf_refcount == 0) {
10713 kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt));
10714 state->dts_formats[format - 1] = NULL;
10715 }
10716 }
10717
10718 static void
10719 dtrace_format_destroy(dtrace_state_t *state)
10720 {
10721 int i;
10722
10723 if (state->dts_nformats == 0) {
10724 ASSERT(state->dts_formats == NULL);
10725 return;
10726 }
10727
10728 ASSERT(state->dts_formats != NULL);
10729
10730 for (i = 0; i < state->dts_nformats; i++) {
10731 dtrace_format_t *fmt = state->dts_formats[i];
10732
10733 if (fmt == NULL)
10734 continue;
10735
10736 kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt));
10737 }
10738
10739 kmem_free(state->dts_formats, state->dts_nformats * sizeof (*state->dts_formats));
10740 state->dts_nformats = 0;
10741 state->dts_formats = NULL;
10742 }
10743
10744 /*
10745 * DTrace Predicate Functions
10746 */
10747 static dtrace_predicate_t *
10748 dtrace_predicate_create(dtrace_difo_t *dp)
10749 {
10750 dtrace_predicate_t *pred;
10751
10752 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10753 ASSERT(dp->dtdo_refcnt != 0);
10754
10755 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10756 pred->dtp_difo = dp;
10757 pred->dtp_refcnt = 1;
10758
10759 if (!dtrace_difo_cacheable(dp))
10760 return (pred);
10761
10762 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10763 /*
10764 * This is only theoretically possible -- we have had 2^32
10765 * cacheable predicates on this machine. We cannot allow any
10766 * more predicates to become cacheable: as unlikely as it is,
10767 * there may be a thread caching a (now stale) predicate cache
10768 * ID. (N.B.: we are successfully resisting the temptation to
10769 * have this cmn_err() "Holy shit -- we executed this code!")
10770 */
10771 return (pred);
10772 }
10773
10774 pred->dtp_cacheid = dtrace_predcache_id++;
10775
10776 return (pred);
10777 }
10778
10779 static void
10780 dtrace_predicate_hold(dtrace_predicate_t *pred)
10781 {
10782 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10783 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10784 ASSERT(pred->dtp_refcnt > 0);
10785
10786 pred->dtp_refcnt++;
10787 }
10788
10789 static void
10790 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10791 {
10792 dtrace_difo_t *dp = pred->dtp_difo;
10793 #pragma unused(dp) /* __APPLE__ */
10794
10795 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10796 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10797 ASSERT(pred->dtp_refcnt > 0);
10798
10799 if (--pred->dtp_refcnt == 0) {
10800 dtrace_difo_release(pred->dtp_difo, vstate);
10801 kmem_free(pred, sizeof (dtrace_predicate_t));
10802 }
10803 }
10804
10805 /*
10806 * DTrace Action Description Functions
10807 */
10808 static dtrace_actdesc_t *
10809 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10810 uint64_t uarg, uint64_t arg)
10811 {
10812 dtrace_actdesc_t *act;
10813
10814 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
10815 arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
10816
10817 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10818 act->dtad_kind = kind;
10819 act->dtad_ntuple = ntuple;
10820 act->dtad_uarg = uarg;
10821 act->dtad_arg = arg;
10822 act->dtad_refcnt = 1;
10823
10824 return (act);
10825 }
10826
10827 static void
10828 dtrace_actdesc_hold(dtrace_actdesc_t *act)
10829 {
10830 ASSERT(act->dtad_refcnt >= 1);
10831 act->dtad_refcnt++;
10832 }
10833
10834 static void
10835 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10836 {
10837 dtrace_actkind_t kind = act->dtad_kind;
10838 dtrace_difo_t *dp;
10839
10840 ASSERT(act->dtad_refcnt >= 1);
10841
10842 if (--act->dtad_refcnt != 0)
10843 return;
10844
10845 if ((dp = act->dtad_difo) != NULL)
10846 dtrace_difo_release(dp, vstate);
10847
10848 if (DTRACEACT_ISPRINTFLIKE(kind)) {
10849 char *str = (char *)(uintptr_t)act->dtad_arg;
10850
10851 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10852 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10853
10854 if (str != NULL)
10855 kmem_free(str, strlen(str) + 1);
10856 }
10857
10858 kmem_free(act, sizeof (dtrace_actdesc_t));
10859 }
10860
10861 /*
10862 * DTrace ECB Functions
10863 */
10864 static dtrace_ecb_t *
10865 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10866 {
10867 dtrace_ecb_t *ecb;
10868 dtrace_epid_t epid;
10869
10870 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10871
10872 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10873 ecb->dte_predicate = NULL;
10874 ecb->dte_probe = probe;
10875
10876 /*
10877 * The default size is the size of the default action: recording
10878 * the header.
10879 */
10880 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10881 ecb->dte_alignment = sizeof (dtrace_epid_t);
10882
10883 epid = state->dts_epid++;
10884
10885 if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
10886 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10887 int necbs = state->dts_necbs << 1;
10888
10889 ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);
10890
10891 if (necbs == 0) {
10892 ASSERT(oecbs == NULL);
10893 necbs = 1;
10894 }
10895
10896 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10897
10898 if (oecbs != NULL)
10899 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10900
10901 dtrace_membar_producer();
10902 state->dts_ecbs = ecbs;
10903
10904 if (oecbs != NULL) {
10905 /*
10906 * If this state is active, we must dtrace_sync()
10907 * before we can free the old dts_ecbs array: we're
10908 * coming in hot, and there may be active ring
10909 * buffer processing (which indexes into the dts_ecbs
10910 * array) on another CPU.
10911 */
10912 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10913 dtrace_sync();
10914
10915 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10916 }
10917
10918 dtrace_membar_producer();
10919 state->dts_necbs = necbs;
10920 }
10921
10922 ecb->dte_state = state;
10923
10924 ASSERT(state->dts_ecbs[epid - 1] == NULL);
10925 dtrace_membar_producer();
10926 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10927
10928 return (ecb);
10929 }
10930
10931 static int
10932 dtrace_ecb_enable(dtrace_ecb_t *ecb)
10933 {
10934 dtrace_probe_t *probe = ecb->dte_probe;
10935
10936 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
10937 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10938 ASSERT(ecb->dte_next == NULL);
10939
10940 if (probe == NULL) {
10941 /*
10942 * This is the NULL probe -- there's nothing to do.
10943 */
10944 return (0);
10945 }
10946
10947 probe->dtpr_provider->dtpv_ecb_count++;
10948 if (probe->dtpr_ecb == NULL) {
10949 dtrace_provider_t *prov = probe->dtpr_provider;
10950
10951 /*
10952 * We're the first ECB on this probe.
10953 */
10954 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10955
10956 if (ecb->dte_predicate != NULL)
10957 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10958
10959 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10960 probe->dtpr_id, probe->dtpr_arg));
10961 } else {
10962 /*
10963 * This probe is already active. Swing the last pointer to
10964 * point to the new ECB, and issue a dtrace_sync() to assure
10965 * that all CPUs have seen the change.
10966 */
10967 ASSERT(probe->dtpr_ecb_last != NULL);
10968 probe->dtpr_ecb_last->dte_next = ecb;
10969 probe->dtpr_ecb_last = ecb;
10970 probe->dtpr_predcache = 0;
10971
10972 dtrace_sync();
10973 return (0);
10974 }
10975 }
10976
10977 static int
10978 dtrace_ecb_resize(dtrace_ecb_t *ecb)
10979 {
10980 dtrace_action_t *act;
10981 uint32_t curneeded = UINT32_MAX;
10982 uint32_t aggbase = UINT32_MAX;
10983
10984 /*
10985 * If we record anything, we always record the dtrace_rechdr_t. (And
10986 * we always record it first.)
10987 */
10988 ecb->dte_size = sizeof (dtrace_rechdr_t);
10989 ecb->dte_alignment = sizeof (dtrace_epid_t);
10990
10991 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10992 dtrace_recdesc_t *rec = &act->dta_rec;
10993 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
10994
10995 ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
10996
10997 if (DTRACEACT_ISAGG(act->dta_kind)) {
10998 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10999
11000 ASSERT(rec->dtrd_size != 0);
11001 ASSERT(agg->dtag_first != NULL);
11002 ASSERT(act->dta_prev->dta_intuple);
11003 ASSERT(aggbase != UINT32_MAX);
11004 ASSERT(curneeded != UINT32_MAX);
11005
11006 agg->dtag_base = aggbase;
11007 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11008 rec->dtrd_offset = curneeded;
11009 if (curneeded + rec->dtrd_size < curneeded)
11010 return (EINVAL);
11011 curneeded += rec->dtrd_size;
11012 ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
11013
11014 aggbase = UINT32_MAX;
11015 curneeded = UINT32_MAX;
11016 } else if (act->dta_intuple) {
11017 if (curneeded == UINT32_MAX) {
11018 /*
11019 * This is the first record in a tuple. Align
11020 * curneeded to be at offset 4 in an 8-byte
11021 * aligned block.
11022 */
11023 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
11024 ASSERT(aggbase == UINT32_MAX);
11025
11026 curneeded = P2PHASEUP(ecb->dte_size,
11027 sizeof (uint64_t), sizeof (dtrace_aggid_t));
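/*
 * P2PHASEUP(x, 8, 4) yields the smallest value >= x congruent to 4
 * modulo 8: e.g. (hypothetically) dte_size 16 gives curneeded 20, so
 * the 4-byte aggregation id computed below lands at aggbase = 16 --
 * the start of an 8-byte aligned block, as the ASSERT verifies.
 */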
11028
11029 aggbase = curneeded - sizeof (dtrace_aggid_t);
11030 ASSERT(IS_P2ALIGNED(aggbase,
11031 sizeof (uint64_t)));
11032 }
11033
11034 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11035 rec->dtrd_offset = curneeded;
11036 if (curneeded + rec->dtrd_size < curneeded)
11037 return (EINVAL);
11038 curneeded += rec->dtrd_size;
11039 } else {
11040 /* tuples must be followed by an aggregation */
11041 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
11042 ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
11043 rec->dtrd_offset = ecb->dte_size;
11044 if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
11045 return (EINVAL);
11046 ecb->dte_size += rec->dtrd_size;
11047 ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
11048 }
11049 }
11050
11051 if ((act = ecb->dte_action) != NULL &&
11052 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
11053 ecb->dte_size == sizeof (dtrace_rechdr_t)) {
11054 /*
11055 * If the size is still sizeof (dtrace_rechdr_t), then all
11056 * actions store no data; set the size to 0.
11057 */
11058 ecb->dte_size = 0;
11059 }
11060
11061 ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
11062 ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
11063 ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
11064 return (0);
11065 }
11066
11067 static dtrace_action_t *
11068 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11069 {
11070 dtrace_aggregation_t *agg;
11071 size_t size = sizeof (uint64_t);
11072 int ntuple = desc->dtad_ntuple;
11073 dtrace_action_t *act;
11074 dtrace_recdesc_t *frec;
11075 dtrace_aggid_t aggid;
11076 dtrace_state_t *state = ecb->dte_state;
11077
11078 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
11079 agg->dtag_ecb = ecb;
11080
11081 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
11082
11083 switch (desc->dtad_kind) {
11084 case DTRACEAGG_MIN:
11085 agg->dtag_initial = INT64_MAX;
11086 agg->dtag_aggregate = dtrace_aggregate_min;
11087 break;
11088
11089 case DTRACEAGG_MAX:
11090 agg->dtag_initial = INT64_MIN;
11091 agg->dtag_aggregate = dtrace_aggregate_max;
11092 break;
11093
11094 case DTRACEAGG_COUNT:
11095 agg->dtag_aggregate = dtrace_aggregate_count;
11096 break;
11097
11098 case DTRACEAGG_QUANTIZE:
11099 agg->dtag_aggregate = dtrace_aggregate_quantize;
11100 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
11101 sizeof (uint64_t);
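/*
 * That is: a 64-bit value quantizes into 63 positive and 63 negative
 * power-of-two buckets plus a zero bucket --
 * (64 - 1) * 2 + 1 = 127 uint64_t counters.
 */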
11102 break;
11103
11104 case DTRACEAGG_LQUANTIZE: {
11105 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
11106 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
11107
11108 agg->dtag_initial = desc->dtad_arg;
11109 agg->dtag_aggregate = dtrace_aggregate_lquantize;
11110
11111 if (step == 0 || levels == 0)
11112 goto err;
11113
11114 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
11115 break;
11116 }
11117
11118 case DTRACEAGG_LLQUANTIZE: {
11119 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
11120 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
11121 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
11122 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
11123 int64_t v;
11124
11125 agg->dtag_initial = desc->dtad_arg;
11126 agg->dtag_aggregate = dtrace_aggregate_llquantize;
11127
11128 if (factor < 2 || low >= high || nsteps < factor)
11129 goto err;
11130
11131 /*
11132 * Now check that the number of steps evenly divides a power
11133 * of the factor. (This assures both integer bucket size and
11134 * linearity within each magnitude.)
11135 */
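/*
 * For example (hypothetical arguments): factor = 10, nsteps = 20
 * passes -- v walks 10, 100, and 100 % 20 == 0 with 20 % 10 == 0 --
 * whereas nsteps = 15 fails, as no power of 10 is divisible by 15.
 */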
11136 for (v = factor; v < nsteps; v *= factor)
11137 continue;
11138
11139 if ((v % nsteps) || (nsteps % factor))
11140 goto err;
11141
11142 size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
11143 break;
11144 }
11145
11146 case DTRACEAGG_AVG:
11147 agg->dtag_aggregate = dtrace_aggregate_avg;
11148 size = sizeof (uint64_t) * 2;
11149 break;
11150
11151 case DTRACEAGG_STDDEV:
11152 agg->dtag_aggregate = dtrace_aggregate_stddev;
11153 size = sizeof (uint64_t) * 4;
11154 break;
11155
11156 case DTRACEAGG_SUM:
11157 agg->dtag_aggregate = dtrace_aggregate_sum;
11158 break;
11159
11160 default:
11161 goto err;
11162 }
11163
11164 agg->dtag_action.dta_rec.dtrd_size = size;
11165
11166 if (ntuple == 0)
11167 goto err;
11168
11169 /*
11170 * We must make sure that we have enough actions for the n-tuple.
11171 */
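/*
 * For example (hypothetical clause): @a[pid, execname] = count()
 * arrives with ntuple == 2; the backward walk below skips over the two
 * key-recording actions to find dtag_first. Running out of
 * non-aggregating predecessors first means the tuple is malformed.
 */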
11172 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
11173 if (DTRACEACT_ISAGG(act->dta_kind))
11174 break;
11175
11176 if (--ntuple == 0) {
11177 /*
11178 * This is the action with which our n-tuple begins.
11179 */
11180 agg->dtag_first = act;
11181 goto success;
11182 }
11183 }
11184
11185 /*
11186 * This n-tuple is short by ntuple elements. Return failure.
11187 */
11188 ASSERT(ntuple != 0);
11189 err:
11190 kmem_free(agg, sizeof (dtrace_aggregation_t));
11191 return (NULL);
11192
11193 success:
11194 /*
11195 * If the last action in the tuple has a size of zero, it's actually
11196 * an expression argument for the aggregating action.
11197 */
11198 ASSERT(ecb->dte_action_last != NULL);
11199 act = ecb->dte_action_last;
11200
11201 if (act->dta_kind == DTRACEACT_DIFEXPR) {
11202 ASSERT(act->dta_difo != NULL);
11203
11204 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11205 agg->dtag_hasarg = 1;
11206 }
11207
11208 /*
11209 * We need to allocate an id for this aggregation.
11210 */
11211 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11212 VM_BESTFIT | VM_SLEEP);
11213
11214 if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
11215 dtrace_aggregation_t **oaggs = state->dts_aggregations;
11216 dtrace_aggregation_t **aggs;
11217 int naggs = state->dts_naggregations << 1;
11218 int onaggs = state->dts_naggregations;
11219
11220 ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);
11221
11222 if (naggs == 0) {
11223 ASSERT(oaggs == NULL);
11224 naggs = 1;
11225 }
11226
11227 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11228
11229 if (oaggs != NULL) {
11230 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11231 kmem_free(oaggs, onaggs * sizeof (*aggs));
11232 }
11233
11234 state->dts_aggregations = aggs;
11235 state->dts_naggregations = naggs;
11236 }
11237
11238 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11239 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11240
11241 frec = &agg->dtag_first->dta_rec;
11242 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11243 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11244
11245 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11246 ASSERT(!act->dta_intuple);
11247 act->dta_intuple = 1;
11248 }
11249
11250 return (&agg->dtag_action);
11251 }
11252
11253 static void
11254 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11255 {
11256 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11257 dtrace_state_t *state = ecb->dte_state;
11258 dtrace_aggid_t aggid = agg->dtag_id;
11259
11260 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11261 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11262
11263 ASSERT(state->dts_aggregations[aggid - 1] == agg);
11264 state->dts_aggregations[aggid - 1] = NULL;
11265
11266 kmem_free(agg, sizeof (dtrace_aggregation_t));
11267 }
11268
11269 static int
11270 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11271 {
11272 dtrace_action_t *action, *last;
11273 dtrace_difo_t *dp = desc->dtad_difo;
11274 uint32_t size = 0, align = sizeof (uint8_t), mask;
11275 uint16_t format = 0;
11276 dtrace_recdesc_t *rec;
11277 dtrace_state_t *state = ecb->dte_state;
11278 dtrace_optval_t *opt = state->dts_options;
11279 dtrace_optval_t nframes = 0, strsize;
11280 uint64_t arg = desc->dtad_arg;
11281
11282 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11283 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11284
11285 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11286 /*
11287 * If this is an aggregating action, there must be neither
11288 * a speculate nor a commit on the action chain.
11289 */
11290 dtrace_action_t *act;
11291
11292 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11293 if (act->dta_kind == DTRACEACT_COMMIT)
11294 return (EINVAL);
11295
11296 if (act->dta_kind == DTRACEACT_SPECULATE)
11297 return (EINVAL);
11298 }
11299
11300 action = dtrace_ecb_aggregation_create(ecb, desc);
11301
11302 if (action == NULL)
11303 return (EINVAL);
11304 } else {
11305 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11306 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11307 dp != NULL && dp->dtdo_destructive)) {
11308 state->dts_destructive = 1;
11309 }
11310
11311 switch (desc->dtad_kind) {
11312 case DTRACEACT_PRINTF:
11313 case DTRACEACT_PRINTA:
11314 case DTRACEACT_SYSTEM:
11315 case DTRACEACT_FREOPEN:
11316 case DTRACEACT_DIFEXPR:
11317 /*
11318 * We know that our arg is a string -- turn it into a
11319 * format.
11320 */
11321 if (arg == 0) {
11322 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11323 desc->dtad_kind == DTRACEACT_DIFEXPR);
11324 format = 0;
11325 } else {
11326 ASSERT(arg != 0);
11327 ASSERT(arg > KERNELBASE);
11328 format = dtrace_format_add(state,
11329 (char *)(uintptr_t)arg);
11330 }
11331
11332 /*FALLTHROUGH*/
11333 case DTRACEACT_LIBACT:
11334 case DTRACEACT_TRACEMEM:
11335 case DTRACEACT_TRACEMEM_DYNSIZE:
11336 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
11337 if (dp == NULL)
11338 return (EINVAL);
11339
11340 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11341 break;
11342
11343 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11344 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11345 return (EINVAL);
11346
11347 size = opt[DTRACEOPT_STRSIZE];
11348 }
11349
11350 break;
11351
11352 case DTRACEACT_STACK:
11353 if ((nframes = arg) == 0) {
11354 nframes = opt[DTRACEOPT_STACKFRAMES];
11355 ASSERT(nframes > 0);
11356 arg = nframes;
11357 }
11358
11359 size = nframes * sizeof (pc_t);
11360 break;
11361
11362 case DTRACEACT_JSTACK:
11363 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11364 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11365
11366 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11367 nframes = opt[DTRACEOPT_JSTACKFRAMES];
11368
11369 arg = DTRACE_USTACK_ARG(nframes, strsize);
11370
11371 /*FALLTHROUGH*/
11372 case DTRACEACT_USTACK:
11373 if (desc->dtad_kind != DTRACEACT_JSTACK &&
11374 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11375 strsize = DTRACE_USTACK_STRSIZE(arg);
11376 nframes = opt[DTRACEOPT_USTACKFRAMES];
11377 ASSERT(nframes > 0);
11378 arg = DTRACE_USTACK_ARG(nframes, strsize);
11379 }
11380
11381 /*
11382 * Save a slot for the pid.
11383 */
11384 size = (nframes + 1) * sizeof (uint64_t);
11385 size += DTRACE_USTACK_STRSIZE(arg);
11386 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11387
11388 break;
11389
11390 case DTRACEACT_SYM:
11391 case DTRACEACT_MOD:
11392 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11393 sizeof (uint64_t)) ||
11394 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11395 return (EINVAL);
11396 break;
11397
11398 case DTRACEACT_USYM:
11399 case DTRACEACT_UMOD:
11400 case DTRACEACT_UADDR:
11401 if (dp == NULL ||
11402 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11403 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11404 return (EINVAL);
11405
11406 /*
11407 * We have a slot for the pid, plus a slot for the
11408 * argument. To keep things simple (aligned with
11409 * bitness-neutral sizing), we store each as a 64-bit
11410 * quantity.
11411 */
11412 size = 2 * sizeof (uint64_t);
11413 break;
11414
11415 case DTRACEACT_STOP:
11416 case DTRACEACT_BREAKPOINT:
11417 case DTRACEACT_PANIC:
11418 break;
11419
11420 case DTRACEACT_CHILL:
11421 case DTRACEACT_DISCARD:
11422 case DTRACEACT_RAISE:
11423 case DTRACEACT_PIDRESUME: /* __APPLE__ */
11424 if (dp == NULL)
11425 return (EINVAL);
11426 break;
11427
11428 case DTRACEACT_EXIT:
11429 if (dp == NULL ||
11430 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11431 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11432 return (EINVAL);
11433 break;
11434
11435 case DTRACEACT_SPECULATE:
11436 if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11437 return (EINVAL);
11438
11439 if (dp == NULL)
11440 return (EINVAL);
11441
11442 state->dts_speculates = 1;
11443 break;
11444
11445 case DTRACEACT_COMMIT: {
11446 dtrace_action_t *act = ecb->dte_action;
11447
11448 for (; act != NULL; act = act->dta_next) {
11449 if (act->dta_kind == DTRACEACT_COMMIT)
11450 return (EINVAL);
11451 }
11452
11453 if (dp == NULL)
11454 return (EINVAL);
11455 break;
11456 }
11457
11458 default:
11459 return (EINVAL);
11460 }
11461
11462 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11463 /*
11464 * If this is a data-storing action or a speculate,
11465 * we must be sure that there isn't a commit on the
11466 * action chain.
11467 */
11468 dtrace_action_t *act = ecb->dte_action;
11469
11470 for (; act != NULL; act = act->dta_next) {
11471 if (act->dta_kind == DTRACEACT_COMMIT)
11472 return (EINVAL);
11473 }
11474 }
11475
11476 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11477 action->dta_rec.dtrd_size = size;
11478 }
11479
11480 action->dta_refcnt = 1;
11481 rec = &action->dta_rec;
11482 size = rec->dtrd_size;
11483
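/*
 * Derive the record's alignment from its size: the loop below selects
 * the largest power of two (up to 8) that evenly divides the size, so
 * e.g. a 12-byte record is 4-byte aligned and a 16-byte record is
 * 8-byte aligned; odd sizes fall through to the default of 1.
 */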
11484 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11485 if (!(size & mask)) {
11486 align = mask + 1;
11487 break;
11488 }
11489 }
11490
11491 action->dta_kind = desc->dtad_kind;
11492
11493 if ((action->dta_difo = dp) != NULL)
11494 dtrace_difo_hold(dp);
11495
11496 rec->dtrd_action = action->dta_kind;
11497 rec->dtrd_arg = arg;
11498 rec->dtrd_uarg = desc->dtad_uarg;
11499 rec->dtrd_alignment = (uint16_t)align;
11500 rec->dtrd_format = format;
11501
11502 if ((last = ecb->dte_action_last) != NULL) {
11503 ASSERT(ecb->dte_action != NULL);
11504 action->dta_prev = last;
11505 last->dta_next = action;
11506 } else {
11507 ASSERT(ecb->dte_action == NULL);
11508 ecb->dte_action = action;
11509 }
11510
11511 ecb->dte_action_last = action;
11512
11513 return (0);
11514 }
11515
11516 static void
11517 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11518 {
11519 dtrace_action_t *act = ecb->dte_action, *next;
11520 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11521 dtrace_difo_t *dp;
11522 uint16_t format;
11523
11524 if (act != NULL && act->dta_refcnt > 1) {
11525 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11526 act->dta_refcnt--;
11527 } else {
11528 for (; act != NULL; act = next) {
11529 next = act->dta_next;
11530 ASSERT(next != NULL || act == ecb->dte_action_last);
11531 ASSERT(act->dta_refcnt == 1);
11532
11533 if ((format = act->dta_rec.dtrd_format) != 0)
11534 dtrace_format_remove(ecb->dte_state, format);
11535
11536 if ((dp = act->dta_difo) != NULL)
11537 dtrace_difo_release(dp, vstate);
11538
11539 if (DTRACEACT_ISAGG(act->dta_kind)) {
11540 dtrace_ecb_aggregation_destroy(ecb, act);
11541 } else {
11542 kmem_free(act, sizeof (dtrace_action_t));
11543 }
11544 }
11545 }
11546
11547 ecb->dte_action = NULL;
11548 ecb->dte_action_last = NULL;
11549 ecb->dte_size = 0;
11550 }
11551
11552 static void
11553 dtrace_ecb_disable(dtrace_ecb_t *ecb)
11554 {
11555 /*
11556 * We disable the ECB by removing it from its probe.
11557 */
11558 dtrace_ecb_t *pecb, *prev = NULL;
11559 dtrace_probe_t *probe = ecb->dte_probe;
11560
11561 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11562
11563 if (probe == NULL) {
11564 /*
11565 * This is the NULL probe; there is nothing to disable.
11566 */
11567 return;
11568 }
11569
11570 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11571 if (pecb == ecb)
11572 break;
11573 prev = pecb;
11574 }
11575
11576 ASSERT(pecb != NULL);
11577
11578 if (prev == NULL) {
11579 probe->dtpr_ecb = ecb->dte_next;
11580 } else {
11581 prev->dte_next = ecb->dte_next;
11582 }
11583
11584 if (ecb == probe->dtpr_ecb_last) {
11585 ASSERT(ecb->dte_next == NULL);
11586 probe->dtpr_ecb_last = prev;
11587 }
11588
11589 probe->dtpr_provider->dtpv_ecb_count--;
11590 /*
11591 * The ECB has been disconnected from the probe; now sync to assure
11592 * that all CPUs have seen the change before returning.
11593 */
11594 dtrace_sync();
11595
11596 if (probe->dtpr_ecb == NULL) {
11597 /*
11598 * That was the last ECB on the probe; clear the predicate
11599 * cache ID for the probe, disable it and sync one more time
11600 * to assure that we'll never hit it again.
11601 */
11602 dtrace_provider_t *prov = probe->dtpr_provider;
11603
11604 ASSERT(ecb->dte_next == NULL);
11605 ASSERT(probe->dtpr_ecb_last == NULL);
11606 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11607 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11608 probe->dtpr_id, probe->dtpr_arg);
11609 dtrace_sync();
11610 } else {
11611 /*
11612 * There is at least one ECB remaining on the probe. If there
11613 * is _exactly_ one, set the probe's predicate cache ID to be
11614 * the predicate cache ID of the remaining ECB.
11615 */
11616 ASSERT(probe->dtpr_ecb_last != NULL);
11617 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11618
11619 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11620 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11621
11622 ASSERT(probe->dtpr_ecb->dte_next == NULL);
11623
11624 if (p != NULL)
11625 probe->dtpr_predcache = p->dtp_cacheid;
11626 }
11627
11628 ecb->dte_next = NULL;
11629 }
11630 }
11631
11632 static void
11633 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11634 {
11635 dtrace_state_t *state = ecb->dte_state;
11636 dtrace_vstate_t *vstate = &state->dts_vstate;
11637 dtrace_predicate_t *pred;
11638 dtrace_epid_t epid = ecb->dte_epid;
11639
11640 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11641 ASSERT(ecb->dte_next == NULL);
11642 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11643
11644 if ((pred = ecb->dte_predicate) != NULL)
11645 dtrace_predicate_release(pred, vstate);
11646
11647 dtrace_ecb_action_remove(ecb);
11648
11649 ASSERT(state->dts_ecbs[epid - 1] == ecb);
11650 state->dts_ecbs[epid - 1] = NULL;
11651
11652 kmem_free(ecb, sizeof (dtrace_ecb_t));
11653 }
11654
11655 static dtrace_ecb_t *
11656 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11657 dtrace_enabling_t *enab)
11658 {
11659 dtrace_ecb_t *ecb;
11660 dtrace_predicate_t *pred;
11661 dtrace_actdesc_t *act;
11662 dtrace_provider_t *prov;
11663 dtrace_ecbdesc_t *desc = enab->dten_current;
11664
11665 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11666 ASSERT(state != NULL);
11667
11668 ecb = dtrace_ecb_add(state, probe);
11669 ecb->dte_uarg = desc->dted_uarg;
11670
11671 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11672 dtrace_predicate_hold(pred);
11673 ecb->dte_predicate = pred;
11674 }
11675
11676 if (probe != NULL) {
11677 /*
11678 * If the provider shows more leg than the consumer is old
11679 * enough to see, we need to enable the appropriate implicit
11680 * predicate bits to prevent the ecb from activating at
11681 * revealing times.
11682 *
11683 * Providers specifying DTRACE_PRIV_USER at register time
11684 * are stating that they need the /proc-style privilege
11685 * model to be enforced, and this is what DTRACE_COND_OWNER
11686 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11687 */
11688 prov = probe->dtpr_provider;
11689 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11690 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11691 ecb->dte_cond |= DTRACE_COND_OWNER;
11692
11693 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11694 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11695 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11696
11697 /*
11698 * If the provider shows us kernel innards and the user
11699 * is lacking sufficient privilege, enable the
11700 * DTRACE_COND_USERMODE implicit predicate.
11701 */
11702 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11703 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11704 ecb->dte_cond |= DTRACE_COND_USERMODE;
11705 }
11706
11707 if (dtrace_ecb_create_cache != NULL) {
11708 /*
11709 * If we have a cached ecb, we'll use its action list instead
11710 * of creating our own (saving both time and space).
11711 */
11712 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11713 dtrace_action_t *act_if = cached->dte_action;
11714
11715 if (act_if != NULL) {
11716 ASSERT(act_if->dta_refcnt > 0);
11717 act_if->dta_refcnt++;
11718 ecb->dte_action = act_if;
11719 ecb->dte_action_last = cached->dte_action_last;
11720 ecb->dte_needed = cached->dte_needed;
11721 ecb->dte_size = cached->dte_size;
11722 ecb->dte_alignment = cached->dte_alignment;
11723 }
11724
11725 return (ecb);
11726 }
11727
11728 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11729 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11730 dtrace_ecb_destroy(ecb);
11731 return (NULL);
11732 }
11733 }
11734
11735 if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
11736 dtrace_ecb_destroy(ecb);
11737 return (NULL);
11738 }
11739
11740 return (dtrace_ecb_create_cache = ecb);
11741 }
11742
11743 static int
11744 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg1, void *arg2)
11745 {
11746 dtrace_ecb_t *ecb;
11747 dtrace_enabling_t *enab = arg1;
11748 dtrace_ecbdesc_t *ep = arg2;
11749 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11750
11751 ASSERT(state != NULL);
11752
11753 if (probe != NULL && ep != NULL && probe->dtpr_gen < ep->dted_probegen) {
11754 /*
11755 * This probe was created in a generation for which this
11756 * enabling has previously created ECBs; we don't want to
11757 * enable it again, so just kick out.
11758 */
11759 return (DTRACE_MATCH_NEXT);
11760 }
11761
11762 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11763 return (DTRACE_MATCH_DONE);
11764
11765 if (dtrace_ecb_enable(ecb) < 0)
11766 return (DTRACE_MATCH_FAIL);
11767
11768 return (DTRACE_MATCH_NEXT);
11769 }
11770
11771 static dtrace_ecb_t *
11772 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11773 {
11774 dtrace_ecb_t *ecb;
11775 #pragma unused(ecb) /* __APPLE__ */
11776
11777 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11778
11779 if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
11780 return (NULL);
11781
11782 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11783 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11784
11785 return (state->dts_ecbs[id - 1]);
11786 }
11787
11788 static dtrace_aggregation_t *
11789 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11790 {
11791 dtrace_aggregation_t *agg;
11792 #pragma unused(agg) /* __APPLE__ */
11793
11794 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11795
11796 if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
11797 return (NULL);
11798
11799 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11800 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11801 agg->dtag_id == id);
11802
11803 return (state->dts_aggregations[id - 1]);
11804 }
11805
11806 /*
11807 * DTrace Buffer Functions
11808 *
11809 * The following functions manipulate DTrace buffers. Most of these functions
11810 * are called in the context of establishing or processing consumer state;
11811 * exceptions are explicitly noted.
11812 */
11813
11814 /*
11815 * Note: called from cross call context. This function switches the two
11816 * buffers on a given CPU. The atomicity of this operation is assured by
11817 * disabling interrupts while the actual switch takes place; the disabling of
11818 * interrupts serializes the execution with any execution of dtrace_probe() on
11819 * the same CPU.
11820 */
11821 static void
11822 dtrace_buffer_switch(dtrace_buffer_t *buf)
11823 {
11824 caddr_t tomax = buf->dtb_tomax;
11825 caddr_t xamot = buf->dtb_xamot;
11826 dtrace_icookie_t cookie;
11827 hrtime_t now;
11828
11829 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11830 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11831
11832 cookie = dtrace_interrupt_disable();
11833 now = dtrace_gethrtime();
11834 buf->dtb_tomax = xamot;
11835 buf->dtb_xamot = tomax;
11836 buf->dtb_xamot_drops = buf->dtb_drops;
11837 buf->dtb_xamot_offset = buf->dtb_offset;
11838 buf->dtb_xamot_errors = buf->dtb_errors;
11839 buf->dtb_xamot_flags = buf->dtb_flags;
11840 buf->dtb_offset = 0;
11841 buf->dtb_drops = 0;
11842 buf->dtb_errors = 0;
11843 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11844 buf->dtb_interval = now - buf->dtb_switched;
11845 buf->dtb_switched = now;
11846 buf->dtb_cur_limit = buf->dtb_limit;
11847
11848 dtrace_interrupt_enable(cookie);
11849 }
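
/*
 * An illustrative sketch (hypothetical helper name, assuming the
 * dtrace_xcall() cross call interface declared in <sys/dtrace_impl.h>)
 * of how a consumer-driven snapshot would drive the switch above.
 */
static void
example_buffer_snapshot(dtrace_state_t *state, processorid_t cpu)
{
        dtrace_buffer_t *buf = &state->dts_buffer[cpu];

        /*
         * Run the switch on the owning CPU; the interrupt disabling in
         * dtrace_buffer_switch() serializes it against dtrace_probe().
         */
        dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);

        /*
         * buf->dtb_xamot now holds buf->dtb_xamot_offset bytes of stable
         * data; probes continue writing to the freshly reset dtb_tomax.
         */
}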
11850
11851 /*
11852 * Note: called from cross call context. This function activates a buffer
11853 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
11854 * is guaranteed by the disabling of interrupts.
11855 */
11856 static void
11857 dtrace_buffer_activate(dtrace_state_t *state)
11858 {
11859 dtrace_buffer_t *buf;
11860 dtrace_icookie_t cookie = dtrace_interrupt_disable();
11861
11862 buf = &state->dts_buffer[CPU->cpu_id];
11863
11864 if (buf->dtb_tomax != NULL) {
11865 /*
11866 * We might like to assert that the buffer is marked inactive,
11867 * but this isn't necessarily true: the CPU that processes the
11868 * BEGIN probe has its buffer activated manually. In this case,
11869 * we take the (harmless) action of re-clearing the INACTIVE
11870 * bit.
11871 */
11872 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11873 }
11874
11875 dtrace_interrupt_enable(cookie);
11876 }
11877
11878 static int
11879 dtrace_buffer_canalloc(size_t size)
11880 {
11881 if (size > (UINT64_MAX - dtrace_buffer_memory_inuse))
11882 return (B_FALSE);
11883 if ((size + dtrace_buffer_memory_inuse) > dtrace_buffer_memory_maxsize)
11884 return (B_FALSE);
11885
11886 return (B_TRUE);
11887 }
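
/*
 * The ordering of the two tests above matters: the first guards the
 * addition in the second against unsigned wraparound. For example, with
 * dtrace_buffer_memory_inuse = UINT64_MAX - 8 and size = 64, the sum
 * (size + dtrace_buffer_memory_inuse) wraps to 55 -- below any plausible
 * maxsize -- so the second test alone would wrongly allow the allocation.
 */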
11888
11889 static int
11890 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t limit, size_t size, int flags,
11891 processorid_t cpu)
11892 {
11893 dtrace_cpu_t *cp;
11894 dtrace_buffer_t *buf;
11895 size_t size_before_alloc = dtrace_buffer_memory_inuse;
11896
11897 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11898 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11899
11900 if (size > (size_t)dtrace_nonroot_maxsize &&
11901 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11902 return (EFBIG);
11903
11904 cp = cpu_list;
11905
11906 do {
11907 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11908 continue;
11909
11910 buf = &bufs[cp->cpu_id];
11911
11912 /*
11913 * If there is already a buffer allocated for this CPU, it
11914 * is only possible that this is a DR (dynamic reconfiguration)
11915 * event; in this case, the buffer size must match our specified size.
11916 */
11917 if (buf->dtb_tomax != NULL) {
11918 ASSERT(buf->dtb_size == size);
11919 continue;
11920 }
11921
11922 ASSERT(buf->dtb_xamot == NULL);
11923
11924 /* DTrace, please do not eat all the memory. */
11925 if (dtrace_buffer_canalloc(size) == B_FALSE)
11926 goto err;
11927 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11928 goto err;
11929 dtrace_buffer_memory_inuse += size;
11930
11931 /* Ensure that the limit is always lower than the size */
11932 limit = limit == size ? limit - 1 : limit;
11933 buf->dtb_cur_limit = limit;
11934 buf->dtb_limit = limit;
11935 buf->dtb_size = size;
11936 buf->dtb_flags = flags;
11937 buf->dtb_offset = 0;
11938 buf->dtb_drops = 0;
11939
11940 if (flags & DTRACEBUF_NOSWITCH)
11941 continue;
11942
11943 /* DTrace, please do not eat all the memory. */
11944 if (dtrace_buffer_canalloc(size) == B_FALSE)
11945 goto err;
11946 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11947 goto err;
11948 dtrace_buffer_memory_inuse += size;
11949 } while ((cp = cp->cpu_next) != cpu_list);
11950
11951 ASSERT(dtrace_buffer_memory_inuse <= dtrace_buffer_memory_maxsize);
11952
11953 return (0);
11954
11955 err:
11956 cp = cpu_list;
11957
11958 do {
11959 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11960 continue;
11961
11962 buf = &bufs[cp->cpu_id];
11963
11964 if (buf->dtb_xamot != NULL) {
11965 ASSERT(buf->dtb_tomax != NULL);
11966 ASSERT(buf->dtb_size == size);
11967 kmem_free(buf->dtb_xamot, size);
11968 }
11969
11970 if (buf->dtb_tomax != NULL) {
11971 ASSERT(buf->dtb_size == size);
11972 kmem_free(buf->dtb_tomax, size);
11973 }
11974
11975 buf->dtb_tomax = NULL;
11976 buf->dtb_xamot = NULL;
11977 buf->dtb_size = 0;
11978 } while ((cp = cp->cpu_next) != cpu_list);
11979
11980 /* Restore the size saved before allocating memory */
11981 dtrace_buffer_memory_inuse = size_before_alloc;
11982
11983 return (ENOMEM);
11984 }
11985
11986 /*
11987 * Note: called from probe context. This function just increments the drop
11988 * count on a buffer. It has been made a function to allow for the
11989 * possibility of understanding the source of mysterious drop counts. (A
11990 * problem for which one may be particularly disappointed that DTrace cannot
11991 * be used to understand DTrace.)
11992 */
11993 static void
11994 dtrace_buffer_drop(dtrace_buffer_t *buf)
11995 {
11996 buf->dtb_drops++;
11997 }
11998
11999 /*
12000 * Note: called from probe context. This function is called to reserve space
12001 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
12002 * mstate. Returns the new offset in the buffer, or a negative value if an
12003 * error has occurred.
12004 */
12005 static intptr_t
12006 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
12007 dtrace_state_t *state, dtrace_mstate_t *mstate)
12008 {
12009 intptr_t offs = buf->dtb_offset, soffs;
12010 intptr_t woffs;
12011 caddr_t tomax;
12012 size_t total_off;
12013
12014 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
12015 return (-1);
12016
12017 if ((tomax = buf->dtb_tomax) == NULL) {
12018 dtrace_buffer_drop(buf);
12019 return (-1);
12020 }
12021
12022 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
12023 while (offs & (align - 1)) {
12024 /*
12025 * Assert that our alignment is off by a number which
12026 * is itself sizeof (uint32_t) aligned.
12027 */
12028 ASSERT(!((align - (offs & (align - 1))) &
12029 (sizeof (uint32_t) - 1)));
12030 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12031 offs += sizeof (uint32_t);
12032 }
12033
12034 if ((uint64_t)(soffs = offs + needed) > buf->dtb_cur_limit) {
12035 if (buf->dtb_cur_limit == buf->dtb_limit) {
12036 buf->dtb_cur_limit = buf->dtb_size;
12037
12038 os_atomic_inc(&state->dts_buf_over_limit, relaxed);
12039 /*
12040 * Set an AST on the current processor
12041 * so that we can wake up the process
12042 * outside of probe context, when we know
12043 * it is safe to do so.
12044 */
12045 minor_t minor = getminor(state->dts_dev);
12046 ASSERT(minor < 32);
12047
12048 os_atomic_or(&dtrace_wake_clients, 1 << minor, relaxed);
12049 ast_dtrace_on();
12050 }
12051 if ((uint64_t)soffs > buf->dtb_size) {
12052 dtrace_buffer_drop(buf);
12053 return (-1);
12054 }
12055 }
12056
12057 if (mstate == NULL)
12058 return (offs);
12059
12060 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12061 mstate->dtms_scratch_size = buf->dtb_size - soffs;
12062 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12063
12064 return (offs);
12065 }
12066
12067 if (buf->dtb_flags & DTRACEBUF_FILL) {
12068 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12069 (buf->dtb_flags & DTRACEBUF_FULL))
12070 return (-1);
12071 goto out;
12072 }
12073
12074 total_off = needed + (offs & (align - 1));
12075
12076 /*
12077 * For a ring buffer, life is quite a bit more complicated. Before
12078 * we can store any padding, we need to adjust our wrapping offset.
12079 * (If we've never before wrapped or we're not about to, no adjustment
12080 * is required.)
12081 */
12082 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
12083 offs + total_off > buf->dtb_size) {
12084 woffs = buf->dtb_xamot_offset;
12085
12086 if (offs + total_off > buf->dtb_size) {
12087 /*
12088 * We can't fit in the end of the buffer. First, a
12089 * sanity check that we can fit in the buffer at all.
12090 */
12091 if (total_off > buf->dtb_size) {
12092 dtrace_buffer_drop(buf);
12093 return (-1);
12094 }
12095
12096 /*
12097 * We're going to be storing at the top of the buffer,
12098 * so now we need to deal with the wrapped offset. We
12099 * only reset our wrapped offset to 0 if it is
12100 * currently greater than the current offset. If it
12101 * is less than the current offset, it is because a
12102 * previous allocation induced a wrap -- but the
12103 * allocation didn't subsequently take the space due
12104 * to an error or false predicate evaluation. In this
12105 * case, we'll just leave the wrapped offset alone: if
12106 * the wrapped offset hasn't been advanced far enough
12107 * for this allocation, it will be adjusted in the
12108 * lower loop.
12109 */
12110 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12111 if (woffs >= offs)
12112 woffs = 0;
12113 } else {
12114 woffs = 0;
12115 }
12116
12117 /*
12118 * Now we know that we're going to be storing to the
12119 * top of the buffer and that there is room for us
12120 * there. We need to clear the buffer from the current
12121 * offset to the end (there may be old gunk there).
12122 */
12123 while ((uint64_t)offs < buf->dtb_size)
12124 tomax[offs++] = 0;
12125
12126 /*
12127 * We need to set our offset to zero. And because we
12128 * are wrapping, we need to set the bit indicating as
12129 * much. We can also adjust our needed space back
12130 * down to the space required by the ECB -- we know
12131 * that the top of the buffer is aligned.
12132 */
12133 offs = 0;
12134 total_off = needed;
12135 buf->dtb_flags |= DTRACEBUF_WRAPPED;
12136 } else {
12137 /*
12138 * There is room for us in the buffer, so we simply
12139 * need to check the wrapped offset.
12140 */
12141 if (woffs < offs) {
12142 /*
12143 * The wrapped offset is less than the offset.
12144 * This can happen if we allocated buffer space
12145 * that induced a wrap, but then we didn't
12146 * subsequently take the space due to an error
12147 * or false predicate evaluation. This is
12148 * okay; we know that _this_ allocation isn't
12149 * going to induce a wrap. We still can't
12150 * reset the wrapped offset to be zero,
12151 * however: the space may have been trashed in
12152 * the previous failed probe attempt. But at
12153 * least the wrapped offset doesn't need to
12154 * be adjusted at all...
12155 */
12156 goto out;
12157 }
12158 }
12159
12160 while (offs + total_off > (size_t)woffs) {
12161 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12162 size_t size;
12163
12164 if (epid == DTRACE_EPIDNONE) {
12165 size = sizeof (uint32_t);
12166 } else {
12167 ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
12168 ASSERT(state->dts_ecbs[epid - 1] != NULL);
12169
12170 size = state->dts_ecbs[epid - 1]->dte_size;
12171 }
12172
12173 ASSERT(woffs + size <= buf->dtb_size);
12174 ASSERT(size != 0);
12175
12176 if (woffs + size == buf->dtb_size) {
12177 /*
12178 * We've reached the end of the buffer; we want
12179 * to set the wrapped offset to 0 and break
12180 * out. However, if the offs is 0, then we're
12181 * in a strange edge-condition: the amount of
12182 * space that we want to reserve plus the size
12183 * of the record that we're overwriting is
12184 * greater than the size of the buffer. This
12185 * is problematic because if we reserve the
12186 * space but subsequently don't consume it (due
12187 * to a failed predicate or error) the wrapped
12188 * offset will be 0 -- yet the EPID at offset 0
12189 * will not be committed. This situation is
12190 * relatively easy to deal with: if we're in
12191 * this case, the buffer is indistinguishable
12192 * from one that hasn't wrapped; we need only
12193 * finish the job by clearing the wrapped bit,
12194 * explicitly setting the offset to be 0, and
12195 * zero'ing out the old data in the buffer.
12196 */
12197 if (offs == 0) {
12198 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12199 buf->dtb_offset = 0;
12200 woffs = total_off;
12201
12202 while ((uint64_t)woffs < buf->dtb_size)
12203 tomax[woffs++] = 0;
12204 }
12205
12206 woffs = 0;
12207 break;
12208 }
12209
12210 woffs += size;
12211 }
12212
12213 /*
12214 * We have a wrapped offset. It may be that the wrapped offset
12215 * has become zero -- that's okay.
12216 */
12217 buf->dtb_xamot_offset = woffs;
12218 }
12219
12220 out:
12221 /*
12222 * Now we can plow the buffer with any necessary padding.
12223 */
12224 while (offs & (align - 1)) {
12225 /*
12226 * Assert that our alignment is off by a number which
12227 * is itself sizeof (uint32_t) aligned.
12228 */
12229 ASSERT(!((align - (offs & (align - 1))) &
12230 (sizeof (uint32_t) - 1)));
12231 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12232 offs += sizeof (uint32_t);
12233 }
12234
12235 if (buf->dtb_flags & DTRACEBUF_FILL) {
12236 if (offs + needed > buf->dtb_size - state->dts_reserve) {
12237 buf->dtb_flags |= DTRACEBUF_FULL;
12238 return (-1);
12239 }
12240 }
12241
12242 if (mstate == NULL)
12243 return (offs);
12244
12245 /*
12246 * For ring buffers and fill buffers, the scratch space is always
12247 * the inactive buffer.
12248 */
12249 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12250 mstate->dtms_scratch_size = buf->dtb_size;
12251 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12252
12253 return (offs);
12254 }
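
/*
 * An illustrative sketch (hypothetical helper name) of the consumer side
 * of the layout established above: each record begins with a uint32_t
 * EPID, DTRACE_EPIDNONE words are alignment padding, and record lengths
 * come from the owning ECB -- the same convention that the wrapped-offset
 * advance loop above relies upon.
 */
static void
example_buffer_walk(dtrace_state_t *state, caddr_t base, size_t len)
{
        size_t offs = 0;

        while (offs + sizeof (uint32_t) <= len) {
                dtrace_epid_t epid = *(uint32_t *)(base + offs);

                if (epid == DTRACE_EPIDNONE) {
                        offs += sizeof (uint32_t);      /* padding word */
                        continue;
                }

                ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
                ASSERT(state->dts_ecbs[epid - 1] != NULL);

                /* The record payload would be decoded here... */
                offs += state->dts_ecbs[epid - 1]->dte_size;
        }
}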
12255
12256 static void
12257 dtrace_buffer_polish(dtrace_buffer_t *buf)
12258 {
12259 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12260 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12261
12262 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12263 return;
12264
12265 /*
12266 * We need to polish the ring buffer. There are three cases:
12267 *
12268 * - The first (and presumably most common) is that there is no gap
12269 * between the buffer offset and the wrapped offset. In this case,
12270 * there is nothing in the buffer that isn't valid data; we can
12271 * mark the buffer as polished and return.
12272 *
12273 * - The second (less common than the first but still more common
12274 * than the third) is that there is a gap between the buffer offset
12275 * and the wrapped offset, and the wrapped offset is larger than the
12276 * buffer offset. This can happen because of an alignment issue, or
12277 * can happen because of a call to dtrace_buffer_reserve() that
12278 * didn't subsequently consume the buffer space. In this case,
12279 * we need to zero the data from the buffer offset to the wrapped
12280 * offset.
12281 *
12282 * - The third (and least common) is that there is a gap between the
12283 * buffer offset and the wrapped offset, but the wrapped offset is
12284 * _less_ than the buffer offset. This can only happen because a
12285 * call to dtrace_buffer_reserve() induced a wrap, but the space
12286 * was not subsequently consumed. In this case, we need to zero the
12287 * space from the offset to the end of the buffer _and_ from the
12288 * top of the buffer to the wrapped offset.
12289 */
12290 if (buf->dtb_offset < buf->dtb_xamot_offset) {
12291 bzero(buf->dtb_tomax + buf->dtb_offset,
12292 buf->dtb_xamot_offset - buf->dtb_offset);
12293 }
12294
12295 if (buf->dtb_offset > buf->dtb_xamot_offset) {
12296 bzero(buf->dtb_tomax + buf->dtb_offset,
12297 buf->dtb_size - buf->dtb_offset);
12298 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12299 }
12300 }
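
/*
 * A worked example of the second case above: with dtb_offset = 0x40 and
 * dtb_xamot_offset = 0x60 on a wrapped buffer, the bytes in [0x40, 0x60)
 * were reserved but never committed; zeroing them makes them parse as
 * DTRACE_EPIDNONE padding rather than as stale record data.
 */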
12301
12302 static void
12303 dtrace_buffer_free(dtrace_buffer_t *bufs)
12304 {
12305 int i;
12306
12307 for (i = 0; i < (int)NCPU; i++) {
12308 dtrace_buffer_t *buf = &bufs[i];
12309
12310 if (buf->dtb_tomax == NULL) {
12311 ASSERT(buf->dtb_xamot == NULL);
12312 ASSERT(buf->dtb_size == 0);
12313 continue;
12314 }
12315
12316 if (buf->dtb_xamot != NULL) {
12317 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12318 kmem_free(buf->dtb_xamot, buf->dtb_size);
12319
12320 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12321 dtrace_buffer_memory_inuse -= buf->dtb_size;
12322 }
12323
12324 kmem_free(buf->dtb_tomax, buf->dtb_size);
12325 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12326 dtrace_buffer_memory_inuse -= buf->dtb_size;
12327
12328 buf->dtb_size = 0;
12329 buf->dtb_tomax = NULL;
12330 buf->dtb_xamot = NULL;
12331 }
12332 }
12333
12334 /*
12335 * DTrace Enabling Functions
12336 */
12337 static dtrace_enabling_t *
12338 dtrace_enabling_create(dtrace_vstate_t *vstate)
12339 {
12340 dtrace_enabling_t *enab;
12341
12342 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12343 enab->dten_vstate = vstate;
12344
12345 return (enab);
12346 }
12347
12348 static void
12349 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12350 {
12351 dtrace_ecbdesc_t **ndesc;
12352 size_t osize, nsize;
12353
12354 /*
12355 * We can't add to enablings after we've enabled them, or after we've
12356 * retained them.
12357 */
12358 ASSERT(enab->dten_probegen == 0);
12359 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12360
12361 /* APPLE NOTE: this protects against gcc 4.0 botch on x86 */
12362 if (ecb == NULL) return;
12363
12364 if (enab->dten_ndesc < enab->dten_maxdesc) {
12365 enab->dten_desc[enab->dten_ndesc++] = ecb;
12366 return;
12367 }
12368
12369 osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12370
12371 if (enab->dten_maxdesc == 0) {
12372 enab->dten_maxdesc = 1;
12373 } else {
12374 enab->dten_maxdesc <<= 1;
12375 }
12376
12377 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12378
12379 nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12380 ndesc = kmem_zalloc(nsize, KM_SLEEP);
12381 bcopy(enab->dten_desc, ndesc, osize);
12382 kmem_free(enab->dten_desc, osize);
12383
12384 enab->dten_desc = ndesc;
12385 enab->dten_desc[enab->dten_ndesc++] = ecb;
12386 }
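
/*
 * A brief note on the growth policy above: doubling dten_maxdesc keeps
 * descriptor insertion amortized O(1) -- growing to a capacity of n
 * copies at most 1 + 2 + 4 + ... + n/2 < n descriptors across all
 * resizes -- at the cost of up to 2x transient over-allocation.
 */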
12387
12388 static void
12389 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12390 dtrace_probedesc_t *pd)
12391 {
12392 dtrace_ecbdesc_t *new;
12393 dtrace_predicate_t *pred;
12394 dtrace_actdesc_t *act;
12395
12396 /*
12397 * We're going to create a new ECB description that matches the
12398 * specified ECB in every way, but has the specified probe description.
12399 */
12400 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12401
12402 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12403 dtrace_predicate_hold(pred);
12404
12405 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12406 dtrace_actdesc_hold(act);
12407
12408 new->dted_action = ecb->dted_action;
12409 new->dted_pred = ecb->dted_pred;
12410 new->dted_probe = *pd;
12411 new->dted_uarg = ecb->dted_uarg;
12412
12413 dtrace_enabling_add(enab, new);
12414 }
12415
12416 static void
12417 dtrace_enabling_dump(dtrace_enabling_t *enab)
12418 {
12419 int i;
12420
12421 for (i = 0; i < enab->dten_ndesc; i++) {
12422 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12423
12424 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12425 desc->dtpd_provider, desc->dtpd_mod,
12426 desc->dtpd_func, desc->dtpd_name);
12427 }
12428 }
12429
12430 static void
12431 dtrace_enabling_destroy(dtrace_enabling_t *enab)
12432 {
12433 int i;
12434 dtrace_ecbdesc_t *ep;
12435 dtrace_vstate_t *vstate = enab->dten_vstate;
12436
12437 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12438
12439 for (i = 0; i < enab->dten_ndesc; i++) {
12440 dtrace_actdesc_t *act, *next;
12441 dtrace_predicate_t *pred;
12442
12443 ep = enab->dten_desc[i];
12444
12445 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12446 dtrace_predicate_release(pred, vstate);
12447
12448 for (act = ep->dted_action; act != NULL; act = next) {
12449 next = act->dtad_next;
12450 dtrace_actdesc_release(act, vstate);
12451 }
12452
12453 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12454 }
12455
12456 kmem_free(enab->dten_desc,
12457 enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
12458
12459 /*
12460 * If this was a retained enabling, decrement the dts_nretained count
12461 * and take it off of the dtrace_retained list.
12462 */
12463 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12464 dtrace_retained == enab) {
12465 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12466 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12467 enab->dten_vstate->dtvs_state->dts_nretained--;
12468 dtrace_retained_gen++;
12469 }
12470
12471 if (enab->dten_prev == NULL) {
12472 if (dtrace_retained == enab) {
12473 dtrace_retained = enab->dten_next;
12474
12475 if (dtrace_retained != NULL)
12476 dtrace_retained->dten_prev = NULL;
12477 }
12478 } else {
12479 ASSERT(enab != dtrace_retained);
12480 ASSERT(dtrace_retained != NULL);
12481 enab->dten_prev->dten_next = enab->dten_next;
12482 }
12483
12484 if (enab->dten_next != NULL) {
12485 ASSERT(dtrace_retained != NULL);
12486 enab->dten_next->dten_prev = enab->dten_prev;
12487 }
12488
12489 kmem_free(enab, sizeof (dtrace_enabling_t));
12490 }
12491
12492 static int
12493 dtrace_enabling_retain(dtrace_enabling_t *enab)
12494 {
12495 dtrace_state_t *state;
12496
12497 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12498 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12499 ASSERT(enab->dten_vstate != NULL);
12500
12501 state = enab->dten_vstate->dtvs_state;
12502 ASSERT(state != NULL);
12503
12504 /*
12505 * We only allow each state to retain dtrace_retain_max enablings.
12506 */
12507 if (state->dts_nretained >= dtrace_retain_max)
12508 return (ENOSPC);
12509
12510 state->dts_nretained++;
12511 dtrace_retained_gen++;
12512
12513 if (dtrace_retained == NULL) {
12514 dtrace_retained = enab;
12515 return (0);
12516 }
12517
12518 enab->dten_next = dtrace_retained;
12519 dtrace_retained->dten_prev = enab;
12520 dtrace_retained = enab;
12521
12522 return (0);
12523 }
12524
12525 static int
12526 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12527 dtrace_probedesc_t *create)
12528 {
12529 dtrace_enabling_t *new, *enab;
12530 int found = 0, err = ENOENT;
12531
12532 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12533 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12534 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12535 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12536 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12537
12538 new = dtrace_enabling_create(&state->dts_vstate);
12539
12540 /*
12541 * Iterate over all retained enablings, looking for enablings that
12542 * match the specified state.
12543 */
12544 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12545 int i;
12546
12547 /*
12548 * dtvs_state can only be NULL for helper enablings -- and
12549 * helper enablings can't be retained.
12550 */
12551 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12552
12553 if (enab->dten_vstate->dtvs_state != state)
12554 continue;
12555
12556 /*
12557 * Now iterate over each probe description; we're looking for
12558 * an exact match to the specified probe description.
12559 */
12560 for (i = 0; i < enab->dten_ndesc; i++) {
12561 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12562 dtrace_probedesc_t *pd = &ep->dted_probe;
12563
12564 /* APPLE NOTE: Darwin employs size bounded string operation. */
12565 if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
12566 continue;
12567
12568 if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
12569 continue;
12570
12571 if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
12572 continue;
12573
12574 if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
12575 continue;
12576
12577 /*
12578 * We have a winning probe! Add it to our growing
12579 * enabling.
12580 */
12581 found = 1;
12582 dtrace_enabling_addlike(new, ep, create);
12583 }
12584 }
12585
12586 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12587 dtrace_enabling_destroy(new);
12588 return (err);
12589 }
12590
12591 return (0);
12592 }
12593
12594 static void
12595 dtrace_enabling_retract(dtrace_state_t *state)
12596 {
12597 dtrace_enabling_t *enab, *next;
12598
12599 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12600
12601 /*
12602 * Iterate over all retained enablings, destroying those retained
12603 * for the specified state.
12604 */
12605 for (enab = dtrace_retained; enab != NULL; enab = next) {
12606 next = enab->dten_next;
12607
12608 /*
12609 * dtvs_state can only be NULL for helper enablings -- and
12610 * helper enablings can't be retained.
12611 */
12612 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12613
12614 if (enab->dten_vstate->dtvs_state == state) {
12615 ASSERT(state->dts_nretained > 0);
12616 dtrace_enabling_destroy(enab);
12617 }
12618 }
12619
12620 ASSERT(state->dts_nretained == 0);
12621 }
12622
12623 static int
12624 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched, dtrace_match_cond_t *cond)
12625 {
12626 int i = 0;
12627 int total_matched = 0, matched = 0;
12628
12629 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12630 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12631
12632 for (i = 0; i < enab->dten_ndesc; i++) {
12633 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12634
12635 enab->dten_current = ep;
12636 enab->dten_error = 0;
12637
12638 /*
12639 * Before doing a dtrace_probe_enable(), which is really
12640 * expensive, check that this enabling satisfies the match
12641 * precondition, if we have one.
12642 */
12643 if (cond && (cond->dmc_func(&ep->dted_probe, cond->dmc_data) == 0)) {
12644 continue;
12645 }
12646 /*
12647 * If a provider failed to enable a probe then get out and
12648 * let the consumer know we failed.
12649 */
12650 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab, ep)) < 0)
12651 return (EBUSY);
12652
12653 total_matched += matched;
12654
12655 if (enab->dten_error != 0) {
12656 /*
12657 * If we get an error half-way through enabling the
12658 * probes, we kick out -- perhaps with some number of
12659 * them enabled. Leaving enabled probes enabled may
12660 * be slightly confusing for user-level, but we expect
12661 * that no one will attempt to actually drive on in
12662 * the face of such errors. If this is an anonymous
12663 * enabling (indicated with a NULL nmatched pointer),
12664 * we cmn_err() a message. We aren't expecting to
12665 * get such an error -- insofar as it can exist at all,
12666 * it would be a result of corrupted DOF in the driver
12667 * properties.
12668 */
12669 if (nmatched == NULL) {
12670 cmn_err(CE_WARN, "dtrace_enabling_match() "
12671 "error on %p: %d", (void *)ep,
12672 enab->dten_error);
12673 }
12674
12675 return (enab->dten_error);
12676 }
12677
12678 ep->dted_probegen = dtrace_probegen;
12679 }
12680
12681 if (nmatched != NULL)
12682 *nmatched = total_matched;
12683
12684 return (0);
12685 }
12686
12687 static void
12688 dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond)
12689 {
12690 dtrace_enabling_t *enab;
12691
12692 lck_mtx_lock(&cpu_lock);
12693 lck_mtx_lock(&dtrace_lock);
12694
12695 /*
12696 * Iterate over all retained enablings to see if any probes match
12697 * against them. We only perform this operation on enablings for which
12698 * we have sufficient permissions by virtue of being in the global zone
12699 * or in the same zone as the DTrace client. Because we can be called
12700 * after dtrace_detach() has been called, we cannot assert that there
12701 * are retained enablings. We can safely load from dtrace_retained,
12702 * however: the taskq_destroy() at the end of dtrace_detach() will
12703 * block pending our completion.
12704 */
12705
12706 /*
12707 * Darwin doesn't do zones.
12708 * Behave as if always in "global" zone."
12709 */
12710 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12711 (void) dtrace_enabling_match(enab, NULL, cond);
12712 }
12713
12714 lck_mtx_unlock(&dtrace_lock);
12715 lck_mtx_unlock(&cpu_lock);
12716
12717 }
12718
12719 static void
12720 dtrace_enabling_matchall(void)
12721 {
12722 dtrace_enabling_matchall_with_cond(NULL);
12723 }
12724
12725
12726
12727 /*
12728 * If an enabling is to be enabled without having matched probes (that is, if
12729 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12730 * enabling must be _primed_ by creating an ECB for every ECB description.
12731 * This must be done to assure that we know the number of speculations, the
12732 * number of aggregations, the minimum buffer size needed, etc. before we
12733 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
12734 * enabling any probes, we create ECBs for every ECB description, but with a
12735 * NULL probe -- which is exactly what this function does.
12736 */
12737 static void
12738 dtrace_enabling_prime(dtrace_state_t *state)
12739 {
12740 dtrace_enabling_t *enab;
12741 int i;
12742
12743 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12744 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12745
12746 if (enab->dten_vstate->dtvs_state != state)
12747 continue;
12748
12749 /*
12750 * We don't want to prime an enabling more than once, lest
12751 * we allow a malicious user to induce resource exhaustion.
12752 * (The ECBs that result from priming an enabling aren't
12753 * leaked -- but they also aren't deallocated until the
12754 * consumer state is destroyed.)
12755 */
12756 if (enab->dten_primed)
12757 continue;
12758
12759 for (i = 0; i < enab->dten_ndesc; i++) {
12760 enab->dten_current = enab->dten_desc[i];
12761 (void) dtrace_probe_enable(NULL, enab, NULL);
12762 }
12763
12764 enab->dten_primed = 1;
12765 }
12766 }
12767
12768 /*
12769 * Called to indicate that probes should be provided due to retained
12770 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
12771 * must take an initial lap through the retained enablings, calling the
12772 * dtps_provide() entry point explicitly to allow for autocreated probes.
12773 */
12774 static void
12775 dtrace_enabling_provide(dtrace_provider_t *prv)
12776 {
12777 int i, all = 0;
12778 dtrace_probedesc_t desc;
12779 dtrace_genid_t gen;
12780
12781 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12782 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
12783
12784 if (prv == NULL) {
12785 all = 1;
12786 prv = dtrace_provider;
12787 }
12788
12789 do {
12790 dtrace_enabling_t *enab;
12791 void *parg = prv->dtpv_arg;
12792
12793 retry:
12794 gen = dtrace_retained_gen;
12795 for (enab = dtrace_retained; enab != NULL;
12796 enab = enab->dten_next) {
12797 for (i = 0; i < enab->dten_ndesc; i++) {
12798 desc = enab->dten_desc[i]->dted_probe;
12799 lck_mtx_unlock(&dtrace_lock);
12800 prv->dtpv_pops.dtps_provide(parg, &desc);
12801 lck_mtx_lock(&dtrace_lock);
12802 /*
12803 * Process the retained enablings again if
12804 * they have changed while we weren't holding
12805 * dtrace_lock.
12806 */
12807 if (gen != dtrace_retained_gen)
12808 goto retry;
12809 }
12810 }
12811 } while (all && (prv = prv->dtpv_next) != NULL);
12812
12813 lck_mtx_unlock(&dtrace_lock);
12814 dtrace_probe_provide(NULL, all ? NULL : prv);
12815 lck_mtx_lock(&dtrace_lock);
12816 }
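
/*
 * A note on the retry loop above: dtrace_retained_gen acts as a sequence
 * counter for the retained list. Because dtrace_lock is dropped around
 * each dtps_provide() call, the list may be mutated underneath us; every
 * mutation bumps the generation (see dtrace_enabling_retain() and
 * dtrace_enabling_destroy()), so a changed generation means the walk is
 * stale and must restart.
 */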
12817
12818 /*
12819 * DTrace DOF Functions
12820 */
12821 /*ARGSUSED*/
12822 static void
12823 dtrace_dof_error(dof_hdr_t *dof, const char *str)
12824 {
12825 #pragma unused(dof) /* __APPLE__ */
12826 if (dtrace_err_verbose)
12827 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12828
12829 #ifdef DTRACE_ERRDEBUG
12830 dtrace_errdebug(str);
12831 #endif
12832 }
12833
12834 /*
12835 * Create DOF out of a currently enabled state. Right now, we only create
12836 * DOF containing the run-time options -- but this could be expanded to create
12837 * complete DOF representing the enabled state.
12838 */
12839 static dof_hdr_t *
12840 dtrace_dof_create(dtrace_state_t *state)
12841 {
12842 dof_hdr_t *dof;
12843 dof_sec_t *sec;
12844 dof_optdesc_t *opt;
12845 int i, len = sizeof (dof_hdr_t) +
12846 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12847 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12848
12849 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12850
12851 dof = kmem_zalloc_aligned(len, 8, KM_SLEEP);
12852 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12853 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12854 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12855 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12856
12857 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12858 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12859 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12860 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12861 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12862 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12863
12864 dof->dofh_flags = 0;
12865 dof->dofh_hdrsize = sizeof (dof_hdr_t);
12866 dof->dofh_secsize = sizeof (dof_sec_t);
12867 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
12868 dof->dofh_secoff = sizeof (dof_hdr_t);
12869 dof->dofh_loadsz = len;
12870 dof->dofh_filesz = len;
12871 dof->dofh_pad = 0;
12872
12873 /*
12874 * Fill in the option section header...
12875 */
12876 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12877 sec->dofs_type = DOF_SECT_OPTDESC;
12878 sec->dofs_align = sizeof (uint64_t);
12879 sec->dofs_flags = DOF_SECF_LOAD;
12880 sec->dofs_entsize = sizeof (dof_optdesc_t);
12881
12882 opt = (dof_optdesc_t *)((uintptr_t)sec +
12883 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12884
12885 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12886 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12887
12888 for (i = 0; i < DTRACEOPT_MAX; i++) {
12889 opt[i].dofo_option = i;
12890 opt[i].dofo_strtab = DOF_SECIDX_NONE;
12891 opt[i].dofo_value = state->dts_options[i];
12892 }
12893
12894 return (dof);
12895 }
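
/*
 * An illustrative sketch (hypothetical helper name) of consuming the DOF
 * built above: locate the single OPTDESC section via the header and scan
 * it for one option's value. dtrace_dof_options() below is the fully
 * validated version of this walk for arbitrary DOF.
 */
static uint64_t
example_dof_option(dof_hdr_t *dof, uint32_t option)
{
        dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof + dof->dofh_secoff);
        dof_optdesc_t *opt = (dof_optdesc_t *)((uintptr_t)dof +
            sec->dofs_offset);
        uint64_t i;

        ASSERT(sec->dofs_type == DOF_SECT_OPTDESC);

        for (i = 0; i < sec->dofs_size / sec->dofs_entsize; i++) {
                if (opt[i].dofo_option == option)
                        return (opt[i].dofo_value);
        }

        return ((uint64_t)DTRACEOPT_UNSET);
}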
12896
12897 static dof_hdr_t *
12898 dtrace_dof_copyin(user_addr_t uarg, int *errp)
12899 {
12900 dof_hdr_t hdr, *dof;
12901
12902 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12903
12904 /*
12905 * First, we're going to copyin() the sizeof (dof_hdr_t).
12906 */
12907 if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
12908 dtrace_dof_error(NULL, "failed to copyin DOF header");
12909 *errp = EFAULT;
12910 return (NULL);
12911 }
12912
12913 /*
12914 * Now we'll allocate the entire DOF and copy it in -- provided
12915 * that the length isn't outrageous.
12916 */
12917 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12918 dtrace_dof_error(&hdr, "load size exceeds maximum");
12919 *errp = E2BIG;
12920 return (NULL);
12921 }
12922
12923 if (hdr.dofh_loadsz < sizeof (hdr)) {
12924 dtrace_dof_error(&hdr, "invalid load size");
12925 *errp = EINVAL;
12926 return (NULL);
12927 }
12928
12929 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12930
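/*
 * Note that dofh_loadsz is checked again after the full copyin(): the
 * user buffer may change between the two copies, and a header whose
 * claimed load size no longer matched the size actually allocated and
 * copied would let later parsing walk past the allocation.
 */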
12931 if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 ||
12932 dof->dofh_loadsz != hdr.dofh_loadsz) {
12933 kmem_free_aligned(dof, hdr.dofh_loadsz);
12934 *errp = EFAULT;
12935 return (NULL);
12936 }
12937
12938 return (dof);
12939 }
12940
12941 static dof_hdr_t *
12942 dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
12943 {
12944 dof_hdr_t hdr, *dof;
12945
12946 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12947
12948 /*
12949 * First, we're going to read in sizeof (dof_hdr_t) from the target process.
12950 */
12951 if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
12952 dtrace_dof_error(NULL, "failed to copyin DOF header");
12953 *errp = EFAULT;
12954 return (NULL);
12955 }
12956
12957 /*
12958 * Now we'll allocate the entire DOF and copy it in -- provided
12959 * that the length isn't outrageous.
12960 */
12961 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12962 dtrace_dof_error(&hdr, "load size exceeds maximum");
12963 *errp = E2BIG;
12964 return (NULL);
12965 }
12966
12967 if (hdr.dofh_loadsz < sizeof (hdr)) {
12968 dtrace_dof_error(&hdr, "invalid load size");
12969 *errp = EINVAL;
12970 return (NULL);
12971 }
12972
12973 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12974
12975 if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
12976 kmem_free_aligned(dof, hdr.dofh_loadsz);
12977 *errp = EFAULT;
12978 return (NULL);
12979 }
12980
12981 return (dof);
12982 }
12983
12984 static void
12985 dtrace_dof_destroy(dof_hdr_t *dof)
12986 {
12987 kmem_free_aligned(dof, dof->dofh_loadsz);
12988 }
12989
12990 static dof_hdr_t *
12991 dtrace_dof_property(const char *name)
12992 {
12993 unsigned int len = 0;
12994 dof_hdr_t *dof;
12995
12996 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
12997 return NULL;
12998 }
12999
13000 if (!PEReadNVRAMProperty(name, NULL, &len)) {
13001 return NULL;
13002 }
13003
13004 dof = kmem_alloc_aligned(len, 8, KM_SLEEP);
13005
13006 if (!PEReadNVRAMProperty(name, dof, &len)) {
13007 dtrace_dof_destroy(dof);
13008 dtrace_dof_error(NULL, "unreadable DOF");
13009 return NULL;
13010 }
13011
13012 if (len < sizeof (dof_hdr_t)) {
13013 dtrace_dof_destroy(dof);
13014 dtrace_dof_error(NULL, "truncated header");
13015 return (NULL);
13016 }
13017
13018 if (len < dof->dofh_loadsz) {
13019 dtrace_dof_destroy(dof);
13020 dtrace_dof_error(NULL, "truncated DOF");
13021 return (NULL);
13022 }
13023
13024 if (len != dof->dofh_loadsz) {
13025 dtrace_dof_destroy(dof);
13026 dtrace_dof_error(NULL, "invalid DOF size");
13027 return (NULL);
13028 }
13029
13030 if (dof->dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13031 dtrace_dof_destroy(dof);
13032 dtrace_dof_error(NULL, "oversized DOF");
13033 return (NULL);
13034 }
13035
13036 return (dof);
13037 }
13038
13039 /*
13040 * Return the dof_sec_t pointer corresponding to a given section index. If the
13041 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
13042 * a type other than DOF_SECT_NONE is specified, the header is checked against
13043 * this type and NULL is returned if the types do not match.
13044 */
13045 static dof_sec_t *
13046 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13047 {
13048 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13049 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13050
13051 if (i >= dof->dofh_secnum) {
13052 dtrace_dof_error(dof, "referenced section index is invalid");
13053 return (NULL);
13054 }
13055
13056 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13057 dtrace_dof_error(dof, "referenced section is not loadable");
13058 return (NULL);
13059 }
13060
13061 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13062 dtrace_dof_error(dof, "referenced section is the wrong type");
13063 return (NULL);
13064 }
13065
13066 return (sec);
13067 }
13068
13069 static dtrace_probedesc_t *
13070 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13071 {
13072 dof_probedesc_t *probe;
13073 dof_sec_t *strtab;
13074 uintptr_t daddr = (uintptr_t)dof;
13075 uintptr_t str;
13076 size_t size;
13077
13078 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13079 dtrace_dof_error(dof, "invalid probe section");
13080 return (NULL);
13081 }
13082
13083 if (sec->dofs_align != sizeof (dof_secidx_t)) {
13084 dtrace_dof_error(dof, "bad alignment in probe description");
13085 return (NULL);
13086 }
13087
13088 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13089 dtrace_dof_error(dof, "truncated probe description");
13090 return (NULL);
13091 }
13092
13093 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13094 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13095
13096 if (strtab == NULL)
13097 return (NULL);
13098
13099 str = daddr + strtab->dofs_offset;
13100 size = strtab->dofs_size;
13101
13102 if (probe->dofp_provider >= strtab->dofs_size) {
13103 dtrace_dof_error(dof, "corrupt probe provider");
13104 return (NULL);
13105 }
13106
13107 (void) strncpy(desc->dtpd_provider,
13108 (char *)(str + probe->dofp_provider),
13109 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13110
13111 /* APPLE NOTE: Darwin employs size bounded string operation. */
13112 desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
13113
13114 if (probe->dofp_mod >= strtab->dofs_size) {
13115 dtrace_dof_error(dof, "corrupt probe module");
13116 return (NULL);
13117 }
13118
13119 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13120 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13121
13122 /* APPLE NOTE: Darwin employs size bounded string operation. */
13123 desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
13124
13125 if (probe->dofp_func >= strtab->dofs_size) {
13126 dtrace_dof_error(dof, "corrupt probe function");
13127 return (NULL);
13128 }
13129
13130 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13131 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13132
13133 /* APPLE NOTE: Darwin employs size bounded string operation. */
13134 desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
13135
13136 if (probe->dofp_name >= strtab->dofs_size) {
13137 dtrace_dof_error(dof, "corrupt probe name");
13138 return (NULL);
13139 }
13140
13141 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13142 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13143
13144 /* APPLE NOTE: Darwin employs size bounded string operation. */
13145 desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
13146
13147 return (desc);
13148 }
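
/*
 * An illustrative sketch (hypothetical helper name) of the bounded-copy
 * pattern repeated four times above: never copy past the end of the DOF
 * string table or the destination, and force NUL termination, which
 * strncpy() omits whenever the source fills the buffer.
 */
static void
example_strtab_copy(char *dst, size_t dstsize, uintptr_t str, size_t size,
    uint64_t offset)
{
        (void) strncpy(dst, (char *)(str + offset),
            MIN(dstsize - 1, size - offset));
        dst[dstsize - 1] = '\0';
}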
13149
13150 static dtrace_difo_t *
13151 dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13152 cred_t *cr)
13153 {
13154 dtrace_difo_t *dp;
13155 size_t ttl = 0;
13156 dof_difohdr_t *dofd;
13157 uintptr_t daddr = (uintptr_t)dof;
13158 size_t max_size = dtrace_difo_maxsize;
13159 uint_t i;
13160 int l, n;
13161
13162
13163 static const struct {
13164 int section;
13165 int bufoffs;
13166 int lenoffs;
13167 int entsize;
13168 int align;
13169 const char *msg;
13170 } difo[] = {
13171 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
13172 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
13173 sizeof (dif_instr_t), "multiple DIF sections" },
13174
13175 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
13176 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
13177 sizeof (uint64_t), "multiple integer tables" },
13178
13179 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
13180 offsetof(dtrace_difo_t, dtdo_strlen), 0,
13181 sizeof (char), "multiple string tables" },
13182
13183 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
13184 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
13185 sizeof (uint_t), "multiple variable tables" },
13186
13187 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
13188 };
13189
13190 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
13191 dtrace_dof_error(dof, "invalid DIFO header section");
13192 return (NULL);
13193 }
13194
13195 if (sec->dofs_align != sizeof (dof_secidx_t)) {
13196 dtrace_dof_error(dof, "bad alignment in DIFO header");
13197 return (NULL);
13198 }
13199
13200 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
13201 sec->dofs_size % sizeof (dof_secidx_t)) {
13202 dtrace_dof_error(dof, "bad size in DIFO header");
13203 return (NULL);
13204 }
13205
13206 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13207 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
13208
13209 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
13210 dp->dtdo_rtype = dofd->dofd_rtype;
13211
13212 for (l = 0; l < n; l++) {
13213 dof_sec_t *subsec;
13214 void **bufp;
13215 uint32_t *lenp;
13216
13217 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
13218 dofd->dofd_links[l])) == NULL)
13219 goto err; /* invalid section link */
13220
13221 if (ttl + subsec->dofs_size > max_size) {
13222 dtrace_dof_error(dof, "exceeds maximum size");
13223 goto err;
13224 }
13225
13226 ttl += subsec->dofs_size;
13227
13228 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
13229
13230 if (subsec->dofs_type != (uint32_t)difo[i].section)
13231 continue;
13232
13233 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
13234 dtrace_dof_error(dof, "section not loaded");
13235 goto err;
13236 }
13237
13238 if (subsec->dofs_align != (uint32_t)difo[i].align) {
13239 dtrace_dof_error(dof, "bad alignment");
13240 goto err;
13241 }
13242
13243 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
13244 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
13245
13246 if (*bufp != NULL) {
13247 dtrace_dof_error(dof, difo[i].msg);
13248 goto err;
13249 }
13250
13251 if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
13252 dtrace_dof_error(dof, "entry size mismatch");
13253 goto err;
13254 }
13255
13256 if (subsec->dofs_entsize != 0 &&
13257 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
13258 dtrace_dof_error(dof, "corrupt entry size");
13259 goto err;
13260 }
13261
13262 *lenp = subsec->dofs_size;
13263 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
13264 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
13265 *bufp, subsec->dofs_size);
13266
13267 if (subsec->dofs_entsize != 0)
13268 *lenp /= subsec->dofs_entsize;
13269
13270 break;
13271 }
13272
13273 /*
13274 * If we encounter a loadable DIFO sub-section that is not
13275 * known to us, assume this is a broken program and fail.
13276 */
13277 if (difo[i].section == DOF_SECT_NONE &&
13278 (subsec->dofs_flags & DOF_SECF_LOAD)) {
13279 dtrace_dof_error(dof, "unrecognized DIFO subsection");
13280 goto err;
13281 }
13282 }
13283
13284 if (dp->dtdo_buf == NULL) {
13285 /*
13286 * We can't have a DIF object without DIF text.
13287 */
13288 dtrace_dof_error(dof, "missing DIF text");
13289 goto err;
13290 }
13291
13292 /*
13293 * Before we validate the DIF object, run through the variable table
13294 * looking for the strings -- if any of their sizes are zero, we'll set
13295 * their size to be the system-wide default string size. Note that
13296 * this should _not_ happen if the "strsize" option has been set --
13297 * in this case, the compiler should have set the size to reflect the
13298 * setting of the option.
13299 */
13300 for (i = 0; i < dp->dtdo_varlen; i++) {
13301 dtrace_difv_t *v = &dp->dtdo_vartab[i];
13302 dtrace_diftype_t *t = &v->dtdv_type;
13303
13304 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
13305 continue;
13306
13307 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
13308 t->dtdt_size = dtrace_strsize_default;
13309 }
13310
13311 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
13312 goto err;
13313
13314 dtrace_difo_init(dp, vstate);
13315 return (dp);
13316
13317 err:
13318 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
13319 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
13320 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
13321 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
13322
13323 kmem_free(dp, sizeof (dtrace_difo_t));
13324 return (NULL);
13325 }
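
/*
 * A design note on the loader above: the static difo[] table maps each
 * recognized subsection type to the dtrace_difo_t buffer and length
 * fields it fills in (via offsetof()), so supporting a new table kind
 * means adding a table row rather than another parsing clause. The
 * (*bufp != NULL) test is what turns a duplicate section into the
 * per-row "multiple ..." error message.
 */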
13326
13327 static dtrace_predicate_t *
13328 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13329 cred_t *cr)
13330 {
13331 dtrace_difo_t *dp;
13332
13333 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13334 return (NULL);
13335
13336 return (dtrace_predicate_create(dp));
13337 }
13338
13339 static dtrace_actdesc_t *
13340 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13341 cred_t *cr)
13342 {
13343 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13344 dof_actdesc_t *desc;
13345 dof_sec_t *difosec;
13346 size_t offs;
13347 uintptr_t daddr = (uintptr_t)dof;
13348 uint64_t arg;
13349 dtrace_actkind_t kind;
13350
13351 if (sec->dofs_type != DOF_SECT_ACTDESC) {
13352 dtrace_dof_error(dof, "invalid action section");
13353 return (NULL);
13354 }
13355
13356 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13357 dtrace_dof_error(dof, "truncated action description");
13358 return (NULL);
13359 }
13360
13361 if (sec->dofs_align != sizeof (uint64_t)) {
13362 dtrace_dof_error(dof, "bad alignment in action description");
13363 return (NULL);
13364 }
13365
13366 if (sec->dofs_size < sec->dofs_entsize) {
13367 dtrace_dof_error(dof, "section entry size exceeds total size");
13368 return (NULL);
13369 }
13370
13371 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13372 dtrace_dof_error(dof, "bad entry size in action description");
13373 return (NULL);
13374 }
13375
13376 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13377 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13378 return (NULL);
13379 }
13380
13381 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13382 desc = (dof_actdesc_t *)(daddr +
13383 (uintptr_t)sec->dofs_offset + offs);
13384 kind = (dtrace_actkind_t)desc->dofa_kind;
13385
13386 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13387 (kind != DTRACEACT_PRINTA || desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13388 (kind == DTRACEACT_DIFEXPR && desc->dofa_strtab != DOF_SECIDX_NONE))
13389 {
13390 dof_sec_t *strtab;
13391 char *str, *fmt;
13392 uint64_t i;
13393
13394 /*
13395 * The argument to these actions is an index into the
13396 * DOF string table. For printf()-like actions, this
13397 * is the format string. For print(), this is the
13398 * CTF type of the expression result.
13399 */
13400 if ((strtab = dtrace_dof_sect(dof,
13401 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13402 goto err;
13403
13404 str = (char *)((uintptr_t)dof +
13405 (uintptr_t)strtab->dofs_offset);
13406
13407 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13408 if (str[i] == '\0')
13409 break;
13410 }
13411
13412 if (i >= strtab->dofs_size) {
13413 dtrace_dof_error(dof, "bogus format string");
13414 goto err;
13415 }
13416
13417 if (i == desc->dofa_arg) {
13418 dtrace_dof_error(dof, "empty format string");
13419 goto err;
13420 }
13421
13422 i -= desc->dofa_arg;
13423 fmt = kmem_alloc(i + 1, KM_SLEEP);
13424 bcopy(&str[desc->dofa_arg], fmt, i + 1);
13425 arg = (uint64_t)(uintptr_t)fmt;
13426 } else {
13427 if (kind == DTRACEACT_PRINTA) {
13428 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13429 arg = 0;
13430 } else {
13431 arg = desc->dofa_arg;
13432 }
13433 }
13434
13435 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13436 desc->dofa_uarg, arg);
13437
13438 if (last != NULL) {
13439 last->dtad_next = act;
13440 } else {
13441 first = act;
13442 }
13443
13444 last = act;
13445
13446 if (desc->dofa_difo == DOF_SECIDX_NONE)
13447 continue;
13448
13449 if ((difosec = dtrace_dof_sect(dof,
13450 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13451 goto err;
13452
13453 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13454
13455 if (act->dtad_difo == NULL)
13456 goto err;
13457 }
13458
13459 ASSERT(first != NULL);
13460 return (first);
13461
13462 err:
13463 for (act = first; act != NULL; act = next) {
13464 next = act->dtad_next;
13465 dtrace_actdesc_release(act, vstate);
13466 }
13467
13468 return (NULL);
13469 }
13470
13471 static dtrace_ecbdesc_t *
13472 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13473 cred_t *cr)
13474 {
13475 dtrace_ecbdesc_t *ep;
13476 dof_ecbdesc_t *ecb;
13477 dtrace_probedesc_t *desc;
13478 dtrace_predicate_t *pred = NULL;
13479
13480 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13481 dtrace_dof_error(dof, "truncated ECB description");
13482 return (NULL);
13483 }
13484
13485 if (sec->dofs_align != sizeof (uint64_t)) {
13486 dtrace_dof_error(dof, "bad alignment in ECB description");
13487 return (NULL);
13488 }
13489
13490 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13491 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13492
13493 if (sec == NULL)
13494 return (NULL);
13495
13496 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13497 ep->dted_uarg = ecb->dofe_uarg;
13498 desc = &ep->dted_probe;
13499
13500 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13501 goto err;
13502
13503 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13504 if ((sec = dtrace_dof_sect(dof,
13505 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13506 goto err;
13507
13508 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13509 goto err;
13510
13511 ep->dted_pred.dtpdd_predicate = pred;
13512 }
13513
13514 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13515 if ((sec = dtrace_dof_sect(dof,
13516 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13517 goto err;
13518
13519 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13520
13521 if (ep->dted_action == NULL)
13522 goto err;
13523 }
13524
13525 return (ep);
13526
13527 err:
13528 if (pred != NULL)
13529 dtrace_predicate_release(pred, vstate);
13530 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13531 return (NULL);
13532 }
13533
13534 /*
13535 * APPLE NOTE: dyld handles dof relocation.
13536 * Darwin does not need dtrace_dof_relocate()
13537 */
13538
13539 /*
13540 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13541 * header: it should be at the front of a memory region that is at least
13542 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13543 * size. It need not be validated in any other way.
13544 */
13545 static int
13546 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13547 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13548 {
13549 #pragma unused(ubase) /* __APPLE__ */
13550 uint64_t len = dof->dofh_loadsz, seclen;
13551 uintptr_t daddr = (uintptr_t)dof;
13552 dtrace_ecbdesc_t *ep;
13553 dtrace_enabling_t *enab;
13554 uint_t i;
13555
13556 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13557 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13558
13559 /*
13560 * Check the DOF header identification bytes. In addition to checking
13561 * valid settings, we also verify that unused bits/bytes are zeroed so
13562 * we can use them later without fear of regressing existing binaries.
13563 */
13564 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13565 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13566 dtrace_dof_error(dof, "DOF magic string mismatch");
13567 return (-1);
13568 }
13569
13570 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13571 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13572 dtrace_dof_error(dof, "DOF has invalid data model");
13573 return (-1);
13574 }
13575
13576 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13577 dtrace_dof_error(dof, "DOF encoding mismatch");
13578 return (-1);
13579 }
13580
13581 /*
13582 * APPLE NOTE: Darwin only supports DOF_VERSION_3 for now.
13583 */
13584 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
13585 dtrace_dof_error(dof, "DOF version mismatch");
13586 return (-1);
13587 }
13588
13589 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13590 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13591 return (-1);
13592 }
13593
13594 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13595 dtrace_dof_error(dof, "DOF uses too many integer registers");
13596 return (-1);
13597 }
13598
13599 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13600 dtrace_dof_error(dof, "DOF uses too many tuple registers");
13601 return (-1);
13602 }
13603
13604 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13605 if (dof->dofh_ident[i] != 0) {
13606 dtrace_dof_error(dof, "DOF has invalid ident byte set");
13607 return (-1);
13608 }
13609 }
13610
13611 if (dof->dofh_flags & ~DOF_FL_VALID) {
13612 dtrace_dof_error(dof, "DOF has invalid flag bits set");
13613 return (-1);
13614 }
13615
13616 if (dof->dofh_secsize < sizeof(dof_sec_t)) {
13617 dtrace_dof_error(dof, "invalid section header size");
13618 return (-1);
13619 }
13620
13621 /*
13622 * Check that the section headers don't exceed the amount of DOF
13623 * data. Note that we cast the section size and number of sections
13624 * to uint64_t's to prevent possible overflow in the multiplication.
13625 */
13626 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13627
13628 if (dof->dofh_secoff > len || seclen > len ||
13629 dof->dofh_secoff + seclen > len) {
13630 dtrace_dof_error(dof, "truncated section headers");
13631 return (-1);
13632 }
13633
13634 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13635 dtrace_dof_error(dof, "misaligned section headers");
13636 return (-1);
13637 }
13638
13639 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13640 dtrace_dof_error(dof, "misaligned section size");
13641 return (-1);
13642 }
13643
13644 /*
13645 * Take an initial pass through the section headers to be sure that
13646 * the headers don't have stray offsets. If the 'noprobes' flag is
13647 * set, do not permit sections relating to providers, probes, or args.
13648 */
13649 for (i = 0; i < dof->dofh_secnum; i++) {
13650 dof_sec_t *sec = (dof_sec_t *)(daddr +
13651 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13652
13653 if (noprobes) {
13654 switch (sec->dofs_type) {
13655 case DOF_SECT_PROVIDER:
13656 case DOF_SECT_PROBES:
13657 case DOF_SECT_PRARGS:
13658 case DOF_SECT_PROFFS:
13659 dtrace_dof_error(dof, "illegal sections "
13660 "for enabling");
13661 return (-1);
13662 }
13663 }
13664
13665 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13666 continue; /* just ignore non-loadable sections */
13667
13668 if (sec->dofs_align & (sec->dofs_align - 1)) {
13669 dtrace_dof_error(dof, "bad section alignment");
13670 return (-1);
13671 }
13672
13673 if (sec->dofs_offset & (sec->dofs_align - 1)) {
13674 dtrace_dof_error(dof, "misaligned section");
13675 return (-1);
13676 }
13677
13678 if (sec->dofs_offset > len || sec->dofs_size > len ||
13679 sec->dofs_offset + sec->dofs_size > len) {
13680 dtrace_dof_error(dof, "corrupt section header");
13681 return (-1);
13682 }
13683
13684 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13685 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13686 dtrace_dof_error(dof, "non-terminating string table");
13687 return (-1);
13688 }
13689 }
13690
13691 /*
13692 * APPLE NOTE: We have no further relocation to perform.
13693 * All dof values are relative offsets.
13694 */
13695
13696 if ((enab = *enabp) == NULL)
13697 enab = *enabp = dtrace_enabling_create(vstate);
13698
13699 for (i = 0; i < dof->dofh_secnum; i++) {
13700 dof_sec_t *sec = (dof_sec_t *)(daddr +
13701 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13702
13703 if (sec->dofs_type != DOF_SECT_ECBDESC)
13704 continue;
13705
13706 /*
13707 * APPLE NOTE: Defend against gcc 4.0 botch on x86.
13708 * Not all paths out of the inlined dtrace_dof_ecbdesc
13709 * are checked for the NULL return value.
13710 * Check for NULL explicitly here.
13711 */
13712 ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
13713 if (ep == NULL) {
13714 dtrace_enabling_destroy(enab);
13715 *enabp = NULL;
13716 return (-1);
13717 }
13718
13719 dtrace_enabling_add(enab, ep);
13720 }
13721
13722 return (0);
13723 }
13724
13725 /*
13726 * Process DOF for any options. This routine assumes that the DOF has been
13727 * at least processed by dtrace_dof_slurp().
13728 */
13729 static int
13730 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13731 {
13732 uint_t i;
13733 int rval;
13734 uint32_t entsize;
13735 size_t offs;
13736 dof_optdesc_t *desc;
13737
13738 for (i = 0; i < dof->dofh_secnum; i++) {
13739 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13740 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13741
13742 if (sec->dofs_type != DOF_SECT_OPTDESC)
13743 continue;
13744
13745 if (sec->dofs_align != sizeof (uint64_t)) {
13746 dtrace_dof_error(dof, "bad alignment in "
13747 "option description");
13748 return (EINVAL);
13749 }
13750
13751 if ((entsize = sec->dofs_entsize) == 0) {
13752 dtrace_dof_error(dof, "zeroed option entry size");
13753 return (EINVAL);
13754 }
13755
13756 if (entsize < sizeof (dof_optdesc_t)) {
13757 dtrace_dof_error(dof, "bad option entry size");
13758 return (EINVAL);
13759 }
13760
13761 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13762 desc = (dof_optdesc_t *)((uintptr_t)dof +
13763 (uintptr_t)sec->dofs_offset + offs);
13764
13765 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13766 dtrace_dof_error(dof, "non-zero option string");
13767 return (EINVAL);
13768 }
13769
13770 if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
13771 dtrace_dof_error(dof, "unset option");
13772 return (EINVAL);
13773 }
13774
13775 if ((rval = dtrace_state_option(state,
13776 desc->dofo_option, desc->dofo_value)) != 0) {
13777 dtrace_dof_error(dof, "rejected option");
13778 return (rval);
13779 }
13780 }
13781 }
13782
13783 return (0);
13784 }
13785
13786 /*
13787 * DTrace Consumer State Functions
13788 */
13789 static int
13790 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13791 {
13792 size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
13793 void *base;
13794 uintptr_t limit;
13795 dtrace_dynvar_t *dvar, *next, *start;
13796 size_t i;
13797
13798 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13799 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13800
13801 bzero(dstate, sizeof (dtrace_dstate_t));
13802
13803 if ((dstate->dtds_chunksize = chunksize) == 0)
13804 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13805
13806 VERIFY(dstate->dtds_chunksize < (LONG_MAX - sizeof (dtrace_dynhash_t)));
13807
13808 if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13809 size = min_size;
13810
13811 if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
13812 return (ENOMEM);
13813
13814 dstate->dtds_size = size;
13815 dstate->dtds_base = base;
13816 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13817 bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t));
13818
13819 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13820
13821 if (hashsize != 1 && (hashsize & 1))
13822 hashsize--;
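/*
 * Worked example with assumed sizes (illustrative only): if size is
 * 1MB, the chunksize is 256 bytes, and a dtrace_dynhash_t is 16 bytes,
 * then hashsize = 1048576 / 272 = 3855; being odd and != 1, it is
 * decremented to 3854.
 */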
13823
13824 dstate->dtds_hashsize = hashsize;
13825 dstate->dtds_hash = dstate->dtds_base;
13826
13827 /*
13828 * Set all of our hash buckets to point to the single sink, and (if
13829 * it hasn't already been set), set the sink's hash value to be the
13830 * sink sentinel value. The sink is needed for dynamic variable
13831 * lookups to know that they have iterated over an entire, valid hash
13832 * chain.
13833 */
13834 for (i = 0; i < hashsize; i++)
13835 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13836
13837 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13838 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13839
13840 /*
13841 * Determine number of active CPUs. Divide free list evenly among
13842 * active CPUs.
13843 */
13844 start = (dtrace_dynvar_t *)
13845 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13846 limit = (uintptr_t)base + size;
13847
13848 VERIFY((uintptr_t)start < limit);
13849 VERIFY((uintptr_t)start >= (uintptr_t)base);
13850
13851 maxper = (limit - (uintptr_t)start) / (int)NCPU;
13852 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13853
13854 for (i = 0; i < NCPU; i++) {
13855 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13856
13857 /*
13858 * If we don't even have enough chunks to make it once through
13859 * NCPUs, we're just going to allocate everything to the first
13860 * CPU. And if we're on the last CPU, we're going to allocate
13861 * whatever is left over. In either case, we set the limit to
13862 * be the limit of the dynamic variable space.
13863 */
13864 if (maxper == 0 || i == NCPU - 1) {
13865 limit = (uintptr_t)base + size;
13866 start = NULL;
13867 } else {
13868 limit = (uintptr_t)start + maxper;
13869 start = (dtrace_dynvar_t *)limit;
13870 }
13871
13872 VERIFY(limit <= (uintptr_t)base + size);
13873
13874 for (;;) {
13875 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
13876 dstate->dtds_chunksize);
13877
13878 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
13879 break;
13880
13881 VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
13882 (uintptr_t)dvar <= (uintptr_t)base + size);
13883 dvar->dtdv_next = next;
13884 dvar = next;
13885 }
13886
13887 if (maxper == 0)
13888 break;
13889 }
13890
13891 return (0);
13892 }
13893
13894 static void
13895 dtrace_dstate_fini(dtrace_dstate_t *dstate)
13896 {
13897 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13898
13899 if (dstate->dtds_base == NULL)
13900 return;
13901
13902 kmem_free(dstate->dtds_base, dstate->dtds_size);
13903 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
13904 }
13905
13906 static void
13907 dtrace_vstate_fini(dtrace_vstate_t *vstate)
13908 {
13909 /*
13910 * Logical XOR, where are you?
13911 */
13912 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
13913
13914 if (vstate->dtvs_nglobals > 0) {
13915 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
13916 sizeof (dtrace_statvar_t *));
13917 }
13918
13919 if (vstate->dtvs_ntlocals > 0) {
13920 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
13921 sizeof (dtrace_difv_t));
13922 }
13923
13924 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
13925
13926 if (vstate->dtvs_nlocals > 0) {
13927 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
13928 sizeof (dtrace_statvar_t *));
13929 }
13930 }
13931
13932 static void
13933 dtrace_state_clean(dtrace_state_t *state)
13934 {
13935 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13936 return;
13937
13938 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13939 dtrace_speculation_clean(state);
13940 }
13941
13942 static void
13943 dtrace_state_deadman(dtrace_state_t *state)
13944 {
13945 hrtime_t now;
13946
13947 dtrace_sync();
13948
13949 now = dtrace_gethrtime();
13950
13951 if (state != dtrace_anon.dta_state &&
13952 now - state->dts_laststatus >= dtrace_deadman_user)
13953 return;
13954
13955 /*
13956 * We must be sure that dts_alive never appears to be less than the
13957 * value upon entry to dtrace_state_deadman(), and because we lack a
13958 * dtrace_cas64(), we cannot store to it atomically. We thus instead
13959 * store INT64_MAX to it, followed by a memory barrier, followed by
13960 * the new value. This assures that dts_alive never appears to be
13961 * less than its true value, regardless of the order in which the
13962 * stores to the underlying storage are issued.
13963 */
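/*
 * Illustrative interleaving (hypothetical values): a reader that samples
 * dts_alive between the two stores below observes INT64_MAX rather than
 * anything older than the pre-entry timestamp, so the deadman check can
 * only over-estimate liveness, never under-estimate it.
 */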
13964 state->dts_alive = INT64_MAX;
13965 dtrace_membar_producer();
13966 state->dts_alive = now;
13967 }
13968
13969 static int
13970 dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
13971 {
13972 minor_t minor;
13973 major_t major;
13974 char c[30];
13975 dtrace_state_t *state;
13976 dtrace_optval_t *opt;
13977 int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
13978 unsigned int cpu_it;
13979
13980 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13981 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13982
13983 /* Cause restart */
13984 *new_state = NULL;
13985
13986 if (devp != NULL) {
13987 minor = getminor(*devp);
13988 }
13989 else {
13990 minor = DTRACE_NCLIENTS - 1;
13991 }
13992
13993 state = dtrace_state_allocate(minor);
13994 if (NULL == state) {
13995 printf("dtrace_open: couldn't acquire minor number %d. This usually means that too many DTrace clients are in use at the moment\n", minor);
13996 return (ERESTART); /* can't reacquire */
13997 }
13998
13999 state->dts_epid = DTRACE_EPIDNONE + 1;
14000
14001 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
14002 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
14003 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14004
14005 if (devp != NULL) {
14006 major = getemajor(*devp);
14007 } else {
14008 major = ddi_driver_major(dtrace_devi);
14009 }
14010
14011 state->dts_dev = makedev(major, minor);
14012
14013 if (devp != NULL)
14014 *devp = state->dts_dev;
14015
14016 /*
14017 * We allocate NCPU buffers. On the one hand, this can be quite
14018 * a bit of memory per instance (nearly 36K on a Starcat). On the
14019 * other hand, it saves an additional memory reference in the probe
14020 * path.
14021 */
14022 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14023 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14024 state->dts_buf_over_limit = 0;
14025
14026 /*
14027 * Allocate and initialise the per-process per-CPU random state.
14028 * SI_SUB_RANDOM < SI_SUB_DTRACE_ANON, so the entropy device is
14029 * assumed to be seeded at this point (if seeded from the Fortuna seed file).
14030 */
14031 state->dts_rstate = kmem_zalloc(NCPU * sizeof(uint64_t*), KM_SLEEP);
14032 state->dts_rstate[0] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
14033 (void) read_random(state->dts_rstate[0], 2 * sizeof(uint64_t));
14034 for (cpu_it = 1; cpu_it < NCPU; cpu_it++) {
14035 state->dts_rstate[cpu_it] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
14036 /*
14037 * Each CPU is assigned its own non-overlapping subsequence
14038 * of 2^64 values.
14039 */
14040 dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1],
14041 state->dts_rstate[cpu_it]);
14042 }
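/*
 * Sketch of the resulting stream layout: xoroshiro128+ has period
 * 2^128 - 1, and each jump advances the state by 2^64 steps, so CPU 0
 * effectively draws values [0, 2^64), CPU 1 draws [2^64, 2^65), and so
 * on -- the per-CPU sequences cannot collide.
 */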
14043
14044 state->dts_cleaner = CYCLIC_NONE;
14045 state->dts_deadman = CYCLIC_NONE;
14046 state->dts_vstate.dtvs_state = state;
14047
14048 for (i = 0; i < DTRACEOPT_MAX; i++)
14049 state->dts_options[i] = DTRACEOPT_UNSET;
14050
14051 /*
14052 * Set the default options.
14053 */
14054 opt = state->dts_options;
14055 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14056 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14057 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14058 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14059 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14060 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14061 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14062 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14063 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14064 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14065 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14066 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14067 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14068 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14069 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_default;
14070
14071 /*
14072 * Depending on the user credentials, we set flag bits which alter probe
14073 * visibility or the amount of destructiveness allowed. In the case of
14074 * actual anonymous tracing, or the possession of all privileges, all of
14075 * the normal checks are bypassed.
14076 */
14077 #if defined(__APPLE__)
14078 if (cr != NULL) {
14079 kauth_cred_ref(cr);
14080 state->dts_cred.dcr_cred = cr;
14081 }
14082 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14083 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14084 /*
14085 * Allow only proc credentials when DTrace is
14086 * restricted by the current security policy
14087 */
14088 state->dts_cred.dcr_visible = DTRACE_CRV_ALLPROC;
14089 state->dts_cred.dcr_action = DTRACE_CRA_PROC | DTRACE_CRA_PROC_CONTROL | DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14090 }
14091 else {
14092 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14093 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14094 }
14095 }
14096
14097 #else
14098 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14099 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14100 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14101 }
14102 else {
14103 /*
14104 * Set up the credentials for this instantiation. We take a
14105 * hold on the credential to prevent it from disappearing on
14106 * us; this in turn prevents the zone_t referenced by this
14107 * credential from disappearing. This means that we can
14108 * examine the credential and the zone from probe context.
14109 */
14110 crhold(cr);
14111 state->dts_cred.dcr_cred = cr;
14112
14113 /*
14114 * CRA_PROC means "we have *some* privilege for dtrace" and
14115 * unlocks the use of variables like pid, zonename, etc.
14116 */
14117 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14118 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14119 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14120 }
14121
14122 /*
14123 * dtrace_user allows use of syscall and profile providers.
14124 * If the user also has proc_owner and/or proc_zone, we
14125 * extend the scope to include additional visibility and
14126 * destructive power.
14127 */
14128 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14129 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14130 state->dts_cred.dcr_visible |=
14131 DTRACE_CRV_ALLPROC;
14132
14133 state->dts_cred.dcr_action |=
14134 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14135 }
14136
14137 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14138 state->dts_cred.dcr_visible |=
14139 DTRACE_CRV_ALLZONE;
14140
14141 state->dts_cred.dcr_action |=
14142 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14143 }
14144
14145 /*
14146 * If we have all privs in whatever zone this is,
14147 * we can do destructive things to processes which
14148 * have altered credentials.
14149 *
14150 * APPLE NOTE: Darwin doesn't do zones.
14151 * Behave as if zone always has destructive privs.
14152 */
14153
14154 state->dts_cred.dcr_action |=
14155 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14156 }
14157
14158 /*
14159 * Holding the dtrace_kernel privilege also implies that
14160 * the user has the dtrace_user privilege from a visibility
14161 * perspective. But without further privileges, some
14162 * destructive actions are not available.
14163 */
14164 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14165 /*
14166 * Make all probes in all zones visible. However,
14167 * this doesn't mean that all actions become available
14168 * to all zones.
14169 */
14170 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14171 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14172
14173 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14174 DTRACE_CRA_PROC;
14175 /*
14176 * Holding proc_owner means that destructive actions
14177 * for *this* zone are allowed.
14178 */
14179 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14180 state->dts_cred.dcr_action |=
14181 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14182
14183 /*
14184 * Holding proc_zone means that destructive actions
14185 * for this user/group ID in all zones are allowed.
14186 */
14187 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14188 state->dts_cred.dcr_action |=
14189 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14190
14191 /*
14192 * If we have all privs in whatever zone this is,
14193 * we can do destructive things to processes which
14194 * have altered credentials.
14195 *
14196 * APPLE NOTE: Darwin doesn't do zones.
14197 * Behave as if zone always has destructive privs.
14198 */
14199 state->dts_cred.dcr_action |=
14200 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14201 }
14202
14203 /*
14204 * Holding the dtrace_proc privilege gives control over fasttrap
14205 * and pid providers. We need to grant wider destructive
14206 * privileges in the event that the user has proc_owner and/or
14207 * proc_zone.
14208 */
14209 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14210 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14211 state->dts_cred.dcr_action |=
14212 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14213
14214 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14215 state->dts_cred.dcr_action |=
14216 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14217 }
14218 }
14219 #endif
14220
14221 *new_state = state;
14222 return(0); /* Success */
14223 }
14224
14225 static int
14226 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14227 {
14228 dtrace_optval_t *opt = state->dts_options, size;
14229 processorid_t cpu = 0;
14230 size_t limit = buf->dtb_size;
14231 int flags = 0, rval;
14232
14233 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14234 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14235 ASSERT(which < DTRACEOPT_MAX);
14236 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14237 (state == dtrace_anon.dta_state &&
14238 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14239
14240 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14241 return (0);
14242
14243 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14244 cpu = opt[DTRACEOPT_CPU];
14245
14246 if (which == DTRACEOPT_SPECSIZE)
14247 flags |= DTRACEBUF_NOSWITCH;
14248
14249 if (which == DTRACEOPT_BUFSIZE) {
14250 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14251 flags |= DTRACEBUF_RING;
14252
14253 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14254 flags |= DTRACEBUF_FILL;
14255
14256 if (state != dtrace_anon.dta_state ||
14257 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14258 flags |= DTRACEBUF_INACTIVE;
14259 }
14260
14261 for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
14262 /*
14263 * The size must be 8-byte aligned. If the size is not 8-byte
14264 * aligned, drop it down by the difference.
14265 */
14266 if (size & (sizeof (uint64_t) - 1))
14267 size -= size & (sizeof (uint64_t) - 1);
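/*
 * E.g. (hypothetical request sizes): 1037 & 7 == 5, so a
 * 1037-byte request drops to 1032, while an already-aligned
 * 1024-byte request is left untouched.
 */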
14268
14269 if (size < state->dts_reserve) {
14270 /*
14271 * Buffers must always be large enough to accommodate
14272 * their prereserved space. We return E2BIG instead
14273 * of ENOMEM in this case to allow for user-level
14274 * software to differentiate the cases.
14275 */
14276 return (E2BIG);
14277 }
14278 limit = opt[DTRACEOPT_BUFLIMIT] * size / 100;
14279 rval = dtrace_buffer_alloc(buf, limit, size, flags, cpu);
14280
14281 if (rval != ENOMEM) {
14282 opt[which] = size;
14283 return (rval);
14284 }
14285
14286 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14287 return (rval);
14288 }
14289
14290 return (ENOMEM);
14291 }
14292
14293 static int
14294 dtrace_state_buffers(dtrace_state_t *state)
14295 {
14296 dtrace_speculation_t *spec = state->dts_speculations;
14297 int rval, i;
14298
14299 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14300 DTRACEOPT_BUFSIZE)) != 0)
14301 return (rval);
14302
14303 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14304 DTRACEOPT_AGGSIZE)) != 0)
14305 return (rval);
14306
14307 for (i = 0; i < state->dts_nspeculations; i++) {
14308 if ((rval = dtrace_state_buffer(state,
14309 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14310 return (rval);
14311 }
14312
14313 return (0);
14314 }
14315
14316 static void
14317 dtrace_state_prereserve(dtrace_state_t *state)
14318 {
14319 dtrace_ecb_t *ecb;
14320 dtrace_probe_t *probe;
14321
14322 state->dts_reserve = 0;
14323
14324 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14325 return;
14326
14327 /*
14328 * If our buffer policy is a "fill" buffer policy, we need to set the
14329 * prereserved space to be the space required by the END probes.
14330 */
14331 probe = dtrace_probes[dtrace_probeid_end - 1];
14332 ASSERT(probe != NULL);
14333
14334 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14335 if (ecb->dte_state != state)
14336 continue;
14337
14338 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14339 }
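/*
 * Worked example with made-up ECB sizes: two matching END ECBs needing
 * 64 and 32 bytes with 8-byte alignment reserve
 * (64 + 8) + (32 + 8) = 112 bytes, guaranteeing that the END records
 * fit even in a full "fill" buffer.
 */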
14340 }
14341
14342 static int
14343 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14344 {
14345 dtrace_optval_t *opt = state->dts_options, sz, nspec;
14346 dtrace_speculation_t *spec;
14347 dtrace_buffer_t *buf;
14348 cyc_handler_t hdlr;
14349 cyc_time_t when;
14350 int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14351 dtrace_icookie_t cookie;
14352
14353 lck_mtx_lock(&cpu_lock);
14354 lck_mtx_lock(&dtrace_lock);
14355
14356 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14357 rval = EBUSY;
14358 goto out;
14359 }
14360
14361 /*
14362 * Before we can perform any checks, we must prime all of the
14363 * retained enablings that correspond to this state.
14364 */
14365 dtrace_enabling_prime(state);
14366
14367 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14368 rval = EACCES;
14369 goto out;
14370 }
14371
14372 dtrace_state_prereserve(state);
14373
14374 /*
14375 * Now what we want to do is try to allocate our speculations.
14376 * We do not automatically resize the number of speculations; if
14377 * this fails, we will fail the operation.
14378 */
14379 nspec = opt[DTRACEOPT_NSPEC];
14380 ASSERT(nspec != DTRACEOPT_UNSET);
14381
14382 if (nspec > INT_MAX) {
14383 rval = ENOMEM;
14384 goto out;
14385 }
14386
14387 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
14388
14389 if (spec == NULL) {
14390 rval = ENOMEM;
14391 goto out;
14392 }
14393
14394 state->dts_speculations = spec;
14395 state->dts_nspeculations = (int)nspec;
14396
14397 for (i = 0; i < nspec; i++) {
14398 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
14399 rval = ENOMEM;
14400 goto err;
14401 }
14402
14403 spec[i].dtsp_buffer = buf;
14404 }
14405
14406 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
14407 if (dtrace_anon.dta_state == NULL) {
14408 rval = ENOENT;
14409 goto out;
14410 }
14411
14412 if (state->dts_necbs != 0) {
14413 rval = EALREADY;
14414 goto out;
14415 }
14416
14417 state->dts_anon = dtrace_anon_grab();
14418 ASSERT(state->dts_anon != NULL);
14419 state = state->dts_anon;
14420
14421 /*
14422 * We want "grabanon" to be set in the grabbed state, so we'll
14423 * copy that option value from the grabbing state into the
14424 * grabbed state.
14425 */
14426 state->dts_options[DTRACEOPT_GRABANON] =
14427 opt[DTRACEOPT_GRABANON];
14428
14429 *cpu = dtrace_anon.dta_beganon;
14430
14431 /*
14432 * If the anonymous state is active (as it almost certainly
14433 * is if the anonymous enabling ultimately matched anything),
14434 * we don't allow any further option processing -- but we
14435 * don't return failure.
14436 */
14437 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14438 goto out;
14439 }
14440
14441 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
14442 opt[DTRACEOPT_AGGSIZE] != 0) {
14443 if (state->dts_aggregations == NULL) {
14444 /*
14445 * We're not going to create an aggregation buffer
14446 * because we don't have any ECBs that contain
14447 * aggregations -- set this option to 0.
14448 */
14449 opt[DTRACEOPT_AGGSIZE] = 0;
14450 } else {
14451 /*
14452 * If we have an aggregation buffer, we must also have
14453 * a buffer to use as scratch.
14454 */
14455 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14456 (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14457 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14458 }
14459 }
14460 }
14461
14462 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
14463 opt[DTRACEOPT_SPECSIZE] != 0) {
14464 if (!state->dts_speculates) {
14465 /*
14466 * We're not going to create speculation buffers
14467 * because we don't have any ECBs that actually
14468 * speculate -- set the speculation size to 0.
14469 */
14470 opt[DTRACEOPT_SPECSIZE] = 0;
14471 }
14472 }
14473
14474 /*
14475 * The bare minimum size for any buffer that we're actually going to
14476 * do anything to is sizeof (uint64_t).
14477 */
14478 sz = sizeof (uint64_t);
14479
14480 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
14481 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
14482 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
14483 /*
14484 * A buffer size has been explicitly set to 0 (or to a size
14485 * that will be adjusted to 0) and we need the space -- we
14486 * need to return failure. We return ENOSPC to differentiate
14487 * it from failing to allocate a buffer due to failure to meet
14488 * the reserve (for which we return E2BIG).
14489 */
14490 rval = ENOSPC;
14491 goto out;
14492 }
14493
14494 if ((rval = dtrace_state_buffers(state)) != 0)
14495 goto err;
14496
14497 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
14498 sz = dtrace_dstate_defsize;
14499
14500 do {
14501 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
14502
14503 if (rval == 0)
14504 break;
14505
14506 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14507 goto err;
14508 } while (sz >>= 1);
14509
14510 opt[DTRACEOPT_DYNVARSIZE] = sz;
14511
14512 if (rval != 0)
14513 goto err;
14514
14515 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
14516 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
14517
14518 if (opt[DTRACEOPT_CLEANRATE] == 0)
14519 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14520
14521 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
14522 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
14523
14524 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
14525 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14526
14527 if (opt[DTRACEOPT_STRSIZE] > dtrace_strsize_max)
14528 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_max;
14529
14530 if (opt[DTRACEOPT_STRSIZE] < dtrace_strsize_min)
14531 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_min;
14532
14533 if (opt[DTRACEOPT_BUFLIMIT] > dtrace_buflimit_max)
14534 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_max;
14535
14536 if (opt[DTRACEOPT_BUFLIMIT] < dtrace_buflimit_min)
14537 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_min;
14538
14539 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
14540 hdlr.cyh_arg = state;
14541 hdlr.cyh_level = CY_LOW_LEVEL;
14542
14543 when.cyt_when = 0;
14544 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
14545
14546 state->dts_cleaner = cyclic_add(&hdlr, &when);
14547
14548 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
14549 hdlr.cyh_arg = state;
14550 hdlr.cyh_level = CY_LOW_LEVEL;
14551
14552 when.cyt_when = 0;
14553 when.cyt_interval = dtrace_deadman_interval;
14554
14555 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
14556 state->dts_deadman = cyclic_add(&hdlr, &when);
14557
14558 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
14559
14560 /*
14561 * Now it's time to actually fire the BEGIN probe. We need to disable
14562 * interrupts here both to record the CPU on which we fired the BEGIN
14563 * probe (the data from this CPU will be processed first at user
14564 * level) and to manually activate the buffer for this CPU.
14565 */
14566 cookie = dtrace_interrupt_disable();
14567 *cpu = CPU->cpu_id;
14568 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
14569 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
14570
14571 dtrace_probe(dtrace_probeid_begin,
14572 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14573 dtrace_interrupt_enable(cookie);
14574 /*
14575 * We may have had an exit action from a BEGIN probe; only change our
14576 * state to ACTIVE if we're still in WARMUP.
14577 */
14578 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
14579 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
14580
14581 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
14582 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
14583
14584 /*
14585 * Regardless of whether we're now in ACTIVE or DRAINING, we
14586 * want each CPU to transition its principal buffer out of the
14587 * INACTIVE state. Doing this assures that no CPU will suddenly begin
14588 * processing an ECB halfway down a probe's ECB chain; all CPUs will
14589 * atomically transition from processing none of a state's ECBs to
14590 * processing all of them.
14591 */
14592 dtrace_xcall(DTRACE_CPUALL,
14593 (dtrace_xcall_t)dtrace_buffer_activate, state);
14594 goto out;
14595
14596 err:
14597 dtrace_buffer_free(state->dts_buffer);
14598 dtrace_buffer_free(state->dts_aggbuffer);
14599
14600 if ((nspec = state->dts_nspeculations) == 0) {
14601 ASSERT(state->dts_speculations == NULL);
14602 goto out;
14603 }
14604
14605 spec = state->dts_speculations;
14606 ASSERT(spec != NULL);
14607
14608 for (i = 0; i < state->dts_nspeculations; i++) {
14609 if ((buf = spec[i].dtsp_buffer) == NULL)
14610 break;
14611
14612 dtrace_buffer_free(buf);
14613 kmem_free(buf, bufsize);
14614 }
14615
14616 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14617 state->dts_nspeculations = 0;
14618 state->dts_speculations = NULL;
14619
14620 out:
14621 lck_mtx_unlock(&dtrace_lock);
14622 lck_mtx_unlock(&cpu_lock);
14623
14624 return (rval);
14625 }
14626
14627 static int
14628 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14629 {
14630 dtrace_icookie_t cookie;
14631
14632 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14633
14634 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14635 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14636 return (EINVAL);
14637
14638 /*
14639 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14640 * to be sure that every CPU has seen it. See below for the details
14641 * on why this is done.
14642 */
14643 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14644 dtrace_sync();
14645
14646 /*
14647 * By this point, it is impossible for any CPU to be still processing
14648 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
14649 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14650 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
14651 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14652 * iff we're in the END probe.
14653 */
14654 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14655 dtrace_sync();
14656 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14657
14658 /*
14659 * Finally, we can release the reserve and call the END probe. We
14660 * disable interrupts across calling the END probe to allow us to
14661 * return the CPU on which we actually called the END probe. This
14662 * allows user-land to be sure that this CPU's principal buffer is
14663 * processed last.
14664 */
14665 state->dts_reserve = 0;
14666
14667 cookie = dtrace_interrupt_disable();
14668 *cpu = CPU->cpu_id;
14669 dtrace_probe(dtrace_probeid_end,
14670 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14671 dtrace_interrupt_enable(cookie);
14672
14673 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14674 dtrace_sync();
14675
14676 return (0);
14677 }
14678
14679 static int
14680 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14681 dtrace_optval_t val)
14682 {
14683 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14684
14685 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14686 return (EBUSY);
14687
14688 if (option >= DTRACEOPT_MAX)
14689 return (EINVAL);
14690
14691 if (option != DTRACEOPT_CPU && val < 0)
14692 return (EINVAL);
14693
14694 switch (option) {
14695 case DTRACEOPT_DESTRUCTIVE:
14696 /*
14697 * Prevent consumers from enabling destructive actions if DTrace
14698 * is running in a restricted environment, or if actions are
14699 * disallowed.
14700 */
14701 if (dtrace_is_restricted() || dtrace_destructive_disallow)
14702 return (EACCES);
14703
14704 state->dts_cred.dcr_destructive = 1;
14705 break;
14706
14707 case DTRACEOPT_BUFSIZE:
14708 case DTRACEOPT_DYNVARSIZE:
14709 case DTRACEOPT_AGGSIZE:
14710 case DTRACEOPT_SPECSIZE:
14711 case DTRACEOPT_STRSIZE:
14712 if (val < 0)
14713 return (EINVAL);
14714
14715 if (val >= LONG_MAX) {
14716 /*
14717 * If this is an otherwise negative value, set it to
14718 * the highest multiple of 128m less than LONG_MAX.
14719 * Technically, we're adjusting the size without
14720 * regard to the buffer resizing policy, but in fact,
14721 * this has no effect -- if we set the buffer size to
14722 * ~LONG_MAX and the buffer policy is ultimately set to
14723 * be "manual", the buffer allocation is guaranteed to
14724 * fail, if only because the allocation requires two
14725 * buffers. (We set the size to the highest
14726 * multiple of 128m because it ensures that the size
14727 * will remain a multiple of a megabyte when
14728 * repeatedly halved -- all the way down to 15m.)
14729 */
14730 val = LONG_MAX - (1 << 27) + 1;
14731 }
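/*
 * Worked out (assuming a 64-bit long): LONG_MAX is 2^63 - 1, so
 * LONG_MAX - (1 << 27) + 1 == 2^63 - 2^27 == (2^36 - 1) * 128m,
 * the largest 128m multiple representable in a long.
 */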
14732 }
14733
14734 state->dts_options[option] = val;
14735
14736 return (0);
14737 }
14738
14739 static void
14740 dtrace_state_destroy(dtrace_state_t *state)
14741 {
14742 dtrace_ecb_t *ecb;
14743 dtrace_vstate_t *vstate = &state->dts_vstate;
14744 minor_t minor = getminor(state->dts_dev);
14745 int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14746 dtrace_speculation_t *spec = state->dts_speculations;
14747 int nspec = state->dts_nspeculations;
14748 uint32_t match;
14749
14750 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14751 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14752
14753 /*
14754 * First, retract any retained enablings for this state.
14755 */
14756 dtrace_enabling_retract(state);
14757 ASSERT(state->dts_nretained == 0);
14758
14759 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14760 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14761 /*
14762 * We have managed to come into dtrace_state_destroy() on a
14763 * hot enabling -- almost certainly because of a disorderly
14764 * shutdown of a consumer. (That is, a consumer that is
14765 * exiting without having called dtrace_stop().) In this case,
14766 * we're going to set our activity to be KILLED, and then
14767 * issue a sync to be sure that everyone is out of probe
14768 * context before we start blowing away ECBs.
14769 */
14770 state->dts_activity = DTRACE_ACTIVITY_KILLED;
14771 dtrace_sync();
14772 }
14773
14774 /*
14775 * Release the credential hold we took in dtrace_state_create().
14776 */
14777 if (state->dts_cred.dcr_cred != NULL)
14778 kauth_cred_unref(&state->dts_cred.dcr_cred);
14779
14780 /*
14781 * Now we can safely disable and destroy any enabled probes. Because
14782 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14783 * (especially if they're all enabled), we take two passes through the
14784 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14785 * in the second we disable whatever is left over.
14786 */
14787 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14788 for (i = 0; i < state->dts_necbs; i++) {
14789 if ((ecb = state->dts_ecbs[i]) == NULL)
14790 continue;
14791
14792 if (match && ecb->dte_probe != NULL) {
14793 dtrace_probe_t *probe = ecb->dte_probe;
14794 dtrace_provider_t *prov = probe->dtpr_provider;
14795
14796 if (!(prov->dtpv_priv.dtpp_flags & match))
14797 continue;
14798 }
14799
14800 dtrace_ecb_disable(ecb);
14801 dtrace_ecb_destroy(ecb);
14802 }
14803
14804 if (!match)
14805 break;
14806 }
14807
14808 /*
14809 * Before we free the buffers, perform one more sync to assure that
14810 * every CPU is out of probe context.
14811 */
14812 dtrace_sync();
14813
14814 dtrace_buffer_free(state->dts_buffer);
14815 dtrace_buffer_free(state->dts_aggbuffer);
14816
14817 for (i = 0; i < (int)NCPU; i++) {
14818 kmem_free(state->dts_rstate[i], 2 * sizeof(uint64_t));
14819 }
14820 kmem_free(state->dts_rstate, NCPU * sizeof(uint64_t*));
14821
14822 for (i = 0; i < nspec; i++)
14823 dtrace_buffer_free(spec[i].dtsp_buffer);
14824
14825 if (state->dts_cleaner != CYCLIC_NONE)
14826 cyclic_remove(state->dts_cleaner);
14827
14828 if (state->dts_deadman != CYCLIC_NONE)
14829 cyclic_remove(state->dts_deadman);
14830
14831 dtrace_dstate_fini(&vstate->dtvs_dynvars);
14832 dtrace_vstate_fini(vstate);
14833 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14834
14835 if (state->dts_aggregations != NULL) {
14836 #if DEBUG
14837 for (i = 0; i < state->dts_naggregations; i++)
14838 ASSERT(state->dts_aggregations[i] == NULL);
14839 #endif
14840 ASSERT(state->dts_naggregations > 0);
14841 kmem_free(state->dts_aggregations,
14842 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14843 }
14844
14845 kmem_free(state->dts_buffer, bufsize);
14846 kmem_free(state->dts_aggbuffer, bufsize);
14847
14848 for (i = 0; i < nspec; i++)
14849 kmem_free(spec[i].dtsp_buffer, bufsize);
14850
14851 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14852
14853 dtrace_format_destroy(state);
14854
14855 vmem_destroy(state->dts_aggid_arena);
14856 dtrace_state_free(minor);
14857 }
14858
14859 /*
14860 * DTrace Anonymous Enabling Functions
14861 */
14862
14863 int
14864 dtrace_keep_kernel_symbols(void)
14865 {
14866 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14867 return 0;
14868 }
14869
14870 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL)
14871 return 1;
14872
14873 return 0;
14874 }
14875
14876 static dtrace_state_t *
14877 dtrace_anon_grab(void)
14878 {
14879 dtrace_state_t *state;
14880
14881 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14882
14883 if ((state = dtrace_anon.dta_state) == NULL) {
14884 ASSERT(dtrace_anon.dta_enabling == NULL);
14885 return (NULL);
14886 }
14887
14888 ASSERT(dtrace_anon.dta_enabling != NULL);
14889 ASSERT(dtrace_retained != NULL);
14890
14891 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14892 dtrace_anon.dta_enabling = NULL;
14893 dtrace_anon.dta_state = NULL;
14894
14895 return (state);
14896 }
14897
14898 static void
14899 dtrace_anon_property(void)
14900 {
14901 int i, rv;
14902 dtrace_state_t *state;
14903 dof_hdr_t *dof;
14904 char c[32]; /* enough for "dof-data-" + digits */
14905
14906 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14907 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14908
14909 for (i = 0; ; i++) {
14910 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
14911
14912 dtrace_err_verbose = 1;
14913
14914 if ((dof = dtrace_dof_property(c)) == NULL) {
14915 dtrace_err_verbose = 0;
14916 break;
14917 }
14918
14919 #ifdef illumos
14920 /*
14921 * We want to create anonymous state, so we need to transition
14922 * the kernel debugger to indicate that DTrace is active. If
14923 * this fails (e.g. because the debugger has modified text in
14924 * some way), we won't continue with the processing.
14925 */
14926 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14927 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
14928 "enabling ignored.");
14929 dtrace_dof_destroy(dof);
14930 break;
14931 }
14932 #endif
14933
14934 /*
14935 * If we haven't allocated an anonymous state, we'll do so now.
14936 */
14937 if ((state = dtrace_anon.dta_state) == NULL) {
14938 rv = dtrace_state_create(NULL, NULL, &state);
14939 dtrace_anon.dta_state = state;
14940 if (rv != 0 || state == NULL) {
14941 /*
14942 * This basically shouldn't happen: the only
14943 * failure mode from dtrace_state_create() is a
14944 * failure of ddi_soft_state_zalloc() that
14945 * itself should never happen. Still, the
14946 * interface allows for a failure mode, and
14947 * we want to fail as gracefully as possible:
14948 * we'll emit an error message and cease
14949 * processing anonymous state in this case.
14950 */
14951 cmn_err(CE_WARN, "failed to create "
14952 "anonymous state");
14953 dtrace_dof_destroy(dof);
14954 break;
14955 }
14956 }
14957
14958 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
14959 &dtrace_anon.dta_enabling, 0, B_TRUE);
14960
14961 if (rv == 0)
14962 rv = dtrace_dof_options(dof, state);
14963
14964 dtrace_err_verbose = 0;
14965 dtrace_dof_destroy(dof);
14966
14967 if (rv != 0) {
14968 /*
14969 * This is malformed DOF; chuck any anonymous state
14970 * that we created.
14971 */
14972 ASSERT(dtrace_anon.dta_enabling == NULL);
14973 dtrace_state_destroy(state);
14974 dtrace_anon.dta_state = NULL;
14975 break;
14976 }
14977
14978 ASSERT(dtrace_anon.dta_enabling != NULL);
14979 }
14980
14981 if (dtrace_anon.dta_enabling != NULL) {
14982 int rval;
14983
14984 /*
14985 * dtrace_enabling_retain() can only fail because we are
14986 * trying to retain more enablings than are allowed -- but
14987 * we only have one anonymous enabling, and we are guaranteed
14988 * to be allowed at least one retained enabling; we assert
14989 * that dtrace_enabling_retain() returns success.
14990 */
14991 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
14992 ASSERT(rval == 0);
14993
14994 dtrace_enabling_dump(dtrace_anon.dta_enabling);
14995 }
14996 }
14997
14998 /*
14999 * DTrace Helper Functions
15000 */
15001 static void
15002 dtrace_helper_trace(dtrace_helper_action_t *helper,
15003 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15004 {
15005 uint32_t size, next, nnext;
15006 int i;
15007 dtrace_helptrace_t *ent;
15008 uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15009
15010 if (!dtrace_helptrace_enabled)
15011 return;
15012
15013 ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15014
15015 /*
15016 * What would a tracing framework be without its own tracing
15017 * framework? (Well, a hell of a lot simpler, for starters...)
15018 */
15019 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15020 sizeof (uint64_t) - sizeof (uint64_t);
15021
15022 /*
15023 * Iterate until we can allocate a slot in the trace buffer.
15024 */
15025 do {
15026 next = dtrace_helptrace_next;
15027
15028 if (next + size < dtrace_helptrace_bufsize) {
15029 nnext = next + size;
15030 } else {
15031 nnext = size;
15032 }
15033 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
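/*
 * Worked wrap-around example with assumed values: if
 * dtrace_helptrace_bufsize is 4096 and size is 100, a CAS from
 * next == 4050 computes nnext == size == 100, and the nnext == size
 * test below resets the slot to offset 0.
 */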
15034
15035 /*
15036 * We have our slot; fill it in.
15037 */
15038 if (nnext == size)
15039 next = 0;
15040
15041 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
15042 ent->dtht_helper = helper;
15043 ent->dtht_where = where;
15044 ent->dtht_nlocals = vstate->dtvs_nlocals;
15045
15046 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15047 mstate->dtms_fltoffs : -1;
15048 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15049 ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
15050
15051 for (i = 0; i < vstate->dtvs_nlocals; i++) {
15052 dtrace_statvar_t *svar;
15053
15054 if ((svar = vstate->dtvs_locals[i]) == NULL)
15055 continue;
15056
15057 ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
15058 ent->dtht_locals[i] =
15059 ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
15060 }
15061 }
15062
15063 static uint64_t
15064 dtrace_helper(int which, dtrace_mstate_t *mstate,
15065 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15066 {
15067 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15068 uint64_t sarg0 = mstate->dtms_arg[0];
15069 uint64_t sarg1 = mstate->dtms_arg[1];
15070 uint64_t rval = 0;
15071 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15072 dtrace_helper_action_t *helper;
15073 dtrace_vstate_t *vstate;
15074 dtrace_difo_t *pred;
15075 int i, trace = dtrace_helptrace_enabled;
15076
15077 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15078
15079 if (helpers == NULL)
15080 return (0);
15081
15082 if ((helper = helpers->dthps_actions[which]) == NULL)
15083 return (0);
15084
15085 vstate = &helpers->dthps_vstate;
15086 mstate->dtms_arg[0] = arg0;
15087 mstate->dtms_arg[1] = arg1;
15088
15089 /*
15090 * Now iterate over each helper. If its predicate evaluates to 'true',
15091 * we'll call the corresponding actions. Note that the below calls
15092 * to dtrace_dif_emulate() may set faults in machine state. This is
15093 * okay: our caller (the outer dtrace_dif_emulate()) will simply
15094 * overwrite the stored DIF offset with its own (which is the desired behavior).
15095 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15096 * from machine state; this is okay, too.
15097 */
15098 for (; helper != NULL; helper = helper->dtha_next) {
15099 if ((pred = helper->dtha_predicate) != NULL) {
15100 if (trace)
15101 dtrace_helper_trace(helper, mstate, vstate, 0);
15102
15103 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15104 goto next;
15105
15106 if (*flags & CPU_DTRACE_FAULT)
15107 goto err;
15108 }
15109
15110 for (i = 0; i < helper->dtha_nactions; i++) {
15111 if (trace)
15112 dtrace_helper_trace(helper,
15113 mstate, vstate, i + 1);
15114
15115 rval = dtrace_dif_emulate(helper->dtha_actions[i],
15116 mstate, vstate, state);
15117
15118 if (*flags & CPU_DTRACE_FAULT)
15119 goto err;
15120 }
15121
15122 next:
15123 if (trace)
15124 dtrace_helper_trace(helper, mstate, vstate,
15125 DTRACE_HELPTRACE_NEXT);
15126 }
15127
15128 if (trace)
15129 dtrace_helper_trace(helper, mstate, vstate,
15130 DTRACE_HELPTRACE_DONE);
15131
15132 /*
15133 * Restore the arg0 that we saved upon entry.
15134 */
15135 mstate->dtms_arg[0] = sarg0;
15136 mstate->dtms_arg[1] = sarg1;
15137
15138 return (rval);
15139
15140 err:
15141 if (trace)
15142 dtrace_helper_trace(helper, mstate, vstate,
15143 DTRACE_HELPTRACE_ERR);
15144
15145 /*
15146 * Restore the arg0 that we saved upon entry.
15147 */
15148 mstate->dtms_arg[0] = sarg0;
15149 mstate->dtms_arg[1] = sarg1;
15150
15151 return (0);
15152 }
15153
15154 static void
15155 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15156 dtrace_vstate_t *vstate)
15157 {
15158 int i;
15159
15160 if (helper->dtha_predicate != NULL)
15161 dtrace_difo_release(helper->dtha_predicate, vstate);
15162
15163 for (i = 0; i < helper->dtha_nactions; i++) {
15164 ASSERT(helper->dtha_actions[i] != NULL);
15165 dtrace_difo_release(helper->dtha_actions[i], vstate);
15166 }
15167
15168 kmem_free(helper->dtha_actions,
15169 helper->dtha_nactions * sizeof (dtrace_difo_t *));
15170 kmem_free(helper, sizeof (dtrace_helper_action_t));
15171 }
15172
15173 static int
15174 dtrace_helper_destroygen(proc_t* p, int gen)
15175 {
15176 dtrace_helpers_t *help = p->p_dtrace_helpers;
15177 dtrace_vstate_t *vstate;
15178 uint_t i;
15179
15180 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15181 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15182
15183 if (help == NULL || gen > help->dthps_generation)
15184 return (EINVAL);
15185
15186 vstate = &help->dthps_vstate;
15187
15188 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15189 dtrace_helper_action_t *last = NULL, *h, *next;
15190
15191 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15192 next = h->dtha_next;
15193
15194 if (h->dtha_generation == gen) {
15195 if (last != NULL) {
15196 last->dtha_next = next;
15197 } else {
15198 help->dthps_actions[i] = next;
15199 }
15200
15201 dtrace_helper_action_destroy(h, vstate);
15202 } else {
15203 last = h;
15204 }
15205 }
15206 }
15207
15208 /*
15209 * Iterate until we've cleared out all helper providers with the
15210 * given generation number.
15211 */
15212 for (;;) {
15213 dtrace_helper_provider_t *prov = NULL;
15214
15215 /*
15216 * Look for a helper provider with the right generation. We
15217 * have to start back at the beginning of the list each time
15218 * because we drop dtrace_lock. It's unlikely that we'll make
15219 * more than two passes.
15220 */
15221 for (i = 0; i < help->dthps_nprovs; i++) {
15222 prov = help->dthps_provs[i];
15223
15224 if (prov->dthp_generation == gen)
15225 break;
15226 }
15227
15228 /*
15229 * If there were no matches, we're done.
15230 */
15231 if (i == help->dthps_nprovs)
15232 break;
15233
15234 /*
15235 * Move the last helper provider into this slot.
15236 */
15237 help->dthps_nprovs--;
15238 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15239 help->dthps_provs[help->dthps_nprovs] = NULL;
15240
15241 lck_mtx_unlock(&dtrace_lock);
15242
15243 /*
15244 * If we have a meta provider, remove this helper provider.
15245 */
15246 if (dtrace_meta_pid != NULL) {
15247 ASSERT(dtrace_deferred_pid == NULL);
15248 dtrace_helper_provider_remove(&prov->dthp_prov,
15249 p);
15250 }
15251
15252 dtrace_helper_provider_destroy(prov);
15253
15254 lck_mtx_lock(&dtrace_lock);
15255 }
15256
15257 return (0);
15258 }
15259
15260 static int
15261 dtrace_helper_validate(dtrace_helper_action_t *helper)
15262 {
15263 int err = 0, i;
15264 dtrace_difo_t *dp;
15265
15266 if ((dp = helper->dtha_predicate) != NULL)
15267 err += dtrace_difo_validate_helper(dp);
15268
15269 for (i = 0; i < helper->dtha_nactions; i++)
15270 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15271
15272 return (err == 0);
15273 }
15274
15275 static int
15276 dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
15277 {
15278 dtrace_helpers_t *help;
15279 dtrace_helper_action_t *helper, *last;
15280 dtrace_actdesc_t *act;
15281 dtrace_vstate_t *vstate;
15282 dtrace_predicate_t *pred;
15283 int count = 0, nactions = 0, i;
15284
15285 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15286 return (EINVAL);
15287
15288 help = p->p_dtrace_helpers;
15289 last = help->dthps_actions[which];
15290 vstate = &help->dthps_vstate;
15291
15292 for (count = 0; last != NULL; last = last->dtha_next) {
15293 count++;
15294 if (last->dtha_next == NULL)
15295 break;
15296 }
15297
15298 /*
15299 * If we already have dtrace_helper_actions_max helper actions for this
15300 * helper action type, we'll refuse to add a new one.
15301 */
15302 if (count >= dtrace_helper_actions_max)
15303 return (ENOSPC);
15304
15305 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15306 helper->dtha_generation = help->dthps_generation;
15307
15308 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15309 ASSERT(pred->dtp_difo != NULL);
15310 dtrace_difo_hold(pred->dtp_difo);
15311 helper->dtha_predicate = pred->dtp_difo;
15312 }
15313
15314 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15315 if (act->dtad_kind != DTRACEACT_DIFEXPR)
15316 goto err;
15317
15318 if (act->dtad_difo == NULL)
15319 goto err;
15320
15321 nactions++;
15322 }
15323
15324 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15325 (helper->dtha_nactions = nactions), KM_SLEEP);
15326
15327 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15328 dtrace_difo_hold(act->dtad_difo);
15329 helper->dtha_actions[i++] = act->dtad_difo;
15330 }
15331
15332 if (!dtrace_helper_validate(helper))
15333 goto err;
15334
15335 if (last == NULL) {
15336 help->dthps_actions[which] = helper;
15337 } else {
15338 last->dtha_next = helper;
15339 }
15340
15341 if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15342 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15343 dtrace_helptrace_next = 0;
15344 }
15345
15346 return (0);
15347 err:
15348 dtrace_helper_action_destroy(helper, vstate);
15349 return (EINVAL);
15350 }
15351
15352 static void
15353 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15354 dof_helper_t *dofhp)
15355 {
15356 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15357 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15358
15359 lck_mtx_lock(&dtrace_lock);
15360
15361 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15362 /*
15363 * If the dtrace module is loaded but not attached, or if
15364 * there isn't a meta provider registered to deal with
15365 * these provider descriptions, we need to postpone creating
15366 * the actual providers until later.
15367 */
15368
15369 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15370 dtrace_deferred_pid != help) {
15371 help->dthps_deferred = 1;
15372 help->dthps_pid = p->p_pid;
15373 help->dthps_next = dtrace_deferred_pid;
15374 help->dthps_prev = NULL;
15375 if (dtrace_deferred_pid != NULL)
15376 dtrace_deferred_pid->dthps_prev = help;
15377 dtrace_deferred_pid = help;
15378 }
15379
15380 lck_mtx_unlock(&dtrace_lock);
15381
15382 } else if (dofhp != NULL) {
15383 /*
15384 * If the dtrace module is loaded and we have a particular
15385 * helper provider description, pass that off to the
15386 * meta provider.
15387 */
15388
15389 lck_mtx_unlock(&dtrace_lock);
15390
15391 dtrace_helper_provide(dofhp, p);
15392
15393 } else {
15394 /*
15395 * Otherwise, just pass all the helper provider descriptions
15396 * off to the meta provider.
15397 */
15398
15399 uint_t i;
15400 lck_mtx_unlock(&dtrace_lock);
15401
15402 for (i = 0; i < help->dthps_nprovs; i++) {
15403 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15404 p);
15405 }
15406 }
15407 }
15408
15409 static int
15410 dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
15411 {
15412 dtrace_helpers_t *help;
15413 dtrace_helper_provider_t *hprov, **tmp_provs;
15414 uint_t tmp_maxprovs, i;
15415
15416 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15417 help = p->p_dtrace_helpers;
15418 ASSERT(help != NULL);
15419
15420 /*
15421 * If we already have dtrace_helper_providers_max helper providers,
15422 * we'll refuse to add a new one.
15423 */
15424 if (help->dthps_nprovs >= dtrace_helper_providers_max)
15425 return (ENOSPC);
15426
15427 /*
15428 * Check to make sure this isn't a duplicate.
15429 */
15430 for (i = 0; i < help->dthps_nprovs; i++) {
15431 if (dofhp->dofhp_addr ==
15432 help->dthps_provs[i]->dthp_prov.dofhp_addr)
15433 return (EALREADY);
15434 }
15435
15436 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15437 hprov->dthp_prov = *dofhp;
15438 hprov->dthp_ref = 1;
15439 hprov->dthp_generation = gen;
15440
15441 /*
15442 * Allocate a bigger table for helper providers if it's already full.
15443 */
15444 if (help->dthps_maxprovs == help->dthps_nprovs) {
15445 tmp_maxprovs = help->dthps_maxprovs;
15446 tmp_provs = help->dthps_provs;
15447
15448 if (help->dthps_maxprovs == 0)
15449 help->dthps_maxprovs = 2;
15450 else
15451 help->dthps_maxprovs *= 2;
15452 if (help->dthps_maxprovs > dtrace_helper_providers_max)
15453 help->dthps_maxprovs = dtrace_helper_providers_max;
15454
15455 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15456
15457 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15458 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15459
15460 if (tmp_provs != NULL) {
15461 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15462 sizeof (dtrace_helper_provider_t *));
15463 kmem_free(tmp_provs, tmp_maxprovs *
15464 sizeof (dtrace_helper_provider_t *));
15465 }
15466 }
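/*
 * Growth sketch: dthps_maxprovs doubles 0 -> 2 -> 4 -> 8 -> ... as
 * providers are added, clamped at dtrace_helper_providers_max, with
 * the old pointer table (if any) copied forward and freed above.
 */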
15467
15468 help->dthps_provs[help->dthps_nprovs] = hprov;
15469 help->dthps_nprovs++;
15470
15471 return (0);
15472 }
15473
15474 static void
15475 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15476 {
15477 lck_mtx_lock(&dtrace_lock);
15478
15479 if (--hprov->dthp_ref == 0) {
15480 dof_hdr_t *dof;
15481 lck_mtx_unlock(&dtrace_lock);
15482 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15483 dtrace_dof_destroy(dof);
15484 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15485 } else {
15486 lck_mtx_unlock(&dtrace_lock);
15487 }
15488 }
15489
15490 static int
15491 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15492 {
15493 uintptr_t daddr = (uintptr_t)dof;
15494 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15495 dof_provider_t *provider;
15496 dof_probe_t *probe;
15497 uint8_t *arg;
15498 char *strtab, *typestr;
15499 dof_stridx_t typeidx;
15500 size_t typesz;
15501 uint_t nprobes, j, k;
15502
15503 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15504
15505 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15506 dtrace_dof_error(dof, "misaligned section offset");
15507 return (-1);
15508 }
15509
15510 /*
15511 * The section needs to be large enough to contain the DOF provider
15512 * structure appropriate for the given version.
15513 */
15514 if (sec->dofs_size <
15515 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15516 offsetof(dof_provider_t, dofpv_prenoffs) :
15517 sizeof (dof_provider_t))) {
15518 dtrace_dof_error(dof, "provider section too small");
15519 return (-1);
15520 }
15521
15522 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15523 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15524 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15525 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15526 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15527
15528 if (str_sec == NULL || prb_sec == NULL ||
15529 arg_sec == NULL || off_sec == NULL)
15530 return (-1);
15531
15532 enoff_sec = NULL;
15533
15534 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15535 provider->dofpv_prenoffs != DOF_SECT_NONE &&
15536 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15537 provider->dofpv_prenoffs)) == NULL)
15538 return (-1);
15539
15540 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15541
15542 if (provider->dofpv_name >= str_sec->dofs_size ||
15543 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15544 dtrace_dof_error(dof, "invalid provider name");
15545 return (-1);
15546 }
15547
15548 if (prb_sec->dofs_entsize == 0 ||
15549 prb_sec->dofs_entsize > prb_sec->dofs_size) {
15550 dtrace_dof_error(dof, "invalid entry size");
15551 return (-1);
15552 }
15553
15554 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15555 dtrace_dof_error(dof, "misaligned entry size");
15556 return (-1);
15557 }
15558
15559 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15560 dtrace_dof_error(dof, "invalid entry size");
15561 return (-1);
15562 }
15563
15564 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15565 dtrace_dof_error(dof, "misaligned section offset");
15566 return (-1);
15567 }
15568
15569 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15570 dtrace_dof_error(dof, "invalid entry size");
15571 return (-1);
15572 }
15573
15574 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15575
15576 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15577
15578 /*
15579 * Take a pass through the probes to check for errors.
15580 */
15581 for (j = 0; j < nprobes; j++) {
15582 probe = (dof_probe_t *)(uintptr_t)(daddr +
15583 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15584
15585 if (probe->dofpr_func >= str_sec->dofs_size) {
15586 dtrace_dof_error(dof, "invalid function name");
15587 return (-1);
15588 }
15589
15590 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15591 dtrace_dof_error(dof, "function name too long");
15592 return (-1);
15593 }
15594
15595 if (probe->dofpr_name >= str_sec->dofs_size ||
15596 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15597 dtrace_dof_error(dof, "invalid probe name");
15598 return (-1);
15599 }
15600
15601 /*
15602 * The offset count must not wrap the index, and the offsets
15603 * must also not overflow the section's data.
15604 */
15605 if (probe->dofpr_offidx + probe->dofpr_noffs <
15606 probe->dofpr_offidx ||
15607 (probe->dofpr_offidx + probe->dofpr_noffs) *
15608 off_sec->dofs_entsize > off_sec->dofs_size) {
15609 dtrace_dof_error(dof, "invalid probe offset");
15610 return (-1);
15611 }
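/*
 * A concrete instance of the wrap check above (a sketch, assuming the
 * 32-bit unsigned DOF fields): dofpr_offidx = 0xfffffffc with
 * dofpr_noffs = 8 sums to 4, which is less than dofpr_offidx. The
 * addition wrapped, so the probe is rejected before the scaled
 * comparison against dofs_size could be defeated by overflow.
 */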
15612
15613 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15614 /*
15615 * If there's no is-enabled offset section, make sure
15616 * there aren't any is-enabled offsets. Otherwise
15617 * perform the same checks as for probe offsets
15618 * (immediately above).
15619 */
15620 if (enoff_sec == NULL) {
15621 if (probe->dofpr_enoffidx != 0 ||
15622 probe->dofpr_nenoffs != 0) {
15623 dtrace_dof_error(dof, "is-enabled "
15624 "offsets with null section");
15625 return (-1);
15626 }
15627 } else if (probe->dofpr_enoffidx +
15628 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15629 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15630 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15631 dtrace_dof_error(dof, "invalid is-enabled "
15632 "offset");
15633 return (-1);
15634 }
15635
15636 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15637 dtrace_dof_error(dof, "zero probe and "
15638 "is-enabled offsets");
15639 return (-1);
15640 }
15641 } else if (probe->dofpr_noffs == 0) {
15642 dtrace_dof_error(dof, "zero probe offsets");
15643 return (-1);
15644 }
15645
15646 if (probe->dofpr_argidx + probe->dofpr_xargc <
15647 probe->dofpr_argidx ||
15648 (probe->dofpr_argidx + probe->dofpr_xargc) *
15649 arg_sec->dofs_entsize > arg_sec->dofs_size) {
15650 dtrace_dof_error(dof, "invalid args");
15651 return (-1);
15652 }
15653
15654 typeidx = probe->dofpr_nargv;
15655 typestr = strtab + probe->dofpr_nargv;
15656 for (k = 0; k < probe->dofpr_nargc; k++) {
15657 if (typeidx >= str_sec->dofs_size) {
15658 dtrace_dof_error(dof, "bad "
15659 "native argument type");
15660 return (-1);
15661 }
15662
15663 typesz = strlen(typestr) + 1;
15664 if (typesz > DTRACE_ARGTYPELEN) {
15665 dtrace_dof_error(dof, "native "
15666 "argument type too long");
15667 return (-1);
15668 }
15669 typeidx += typesz;
15670 typestr += typesz;
15671 }
15672
15673 typeidx = probe->dofpr_xargv;
15674 typestr = strtab + probe->dofpr_xargv;
15675 for (k = 0; k < probe->dofpr_xargc; k++) {
15676 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15677 dtrace_dof_error(dof, "bad "
15678 "native argument index");
15679 return (-1);
15680 }
15681
15682 if (typeidx >= str_sec->dofs_size) {
15683 dtrace_dof_error(dof, "bad "
15684 "translated argument type");
15685 return (-1);
15686 }
15687
15688 typesz = strlen(typestr) + 1;
15689 if (typesz > DTRACE_ARGTYPELEN) {
15690 dtrace_dof_error(dof, "translated argument "
15691 "type too long");
15692 return (-1);
15693 }
15694
15695 typeidx += typesz;
15696 typestr += typesz;
15697 }
15698 }
15699
15700 return (0);
15701 }
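/*
 * A rough map of what the validation above walks (section names are
 * real DOF section types; the layout shown is illustrative):
 *
 *   dof_provider_t
 *     dofpv_strtab   -> DOF_SECT_STRTAB   provider/probe/type strings
 *     dofpv_probes   -> DOF_SECT_PROBES   array of dof_probe_t
 *     dofpv_prargs   -> DOF_SECT_PRARGS   uint8_t argument mappings
 *     dofpv_proffs   -> DOF_SECT_PROFFS   uint32_t probe offsets
 *     dofpv_prenoffs -> DOF_SECT_PRENOFFS is-enabled offsets (post-v1)
 *
 * Every index and count in each dof_probe_t is checked against the
 * size of the section it references before any probe is created.
 */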
15702
15703 static int
15704 dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
15705 {
15706 dtrace_helpers_t *help;
15707 dtrace_vstate_t *vstate;
15708 dtrace_enabling_t *enab = NULL;
15709 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15710 uintptr_t daddr = (uintptr_t)dof;
15711
15712 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15713 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15714
15715 if ((help = p->p_dtrace_helpers) == NULL)
15716 help = dtrace_helpers_create(p);
15717
15718 vstate = &help->dthps_vstate;
15719
15720 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15721 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15722 dtrace_dof_destroy(dof);
15723 return (rv);
15724 }
15725
15726 /*
15727 * Look for helper providers and validate their descriptions.
15728 */
15729 if (dhp != NULL) {
15730 for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
15731 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15732 dof->dofh_secoff + i * dof->dofh_secsize);
15733
15734 if (sec->dofs_type != DOF_SECT_PROVIDER)
15735 continue;
15736
15737 if (dtrace_helper_provider_validate(dof, sec) != 0) {
15738 dtrace_enabling_destroy(enab);
15739 dtrace_dof_destroy(dof);
15740 return (-1);
15741 }
15742
15743 nprovs++;
15744 }
15745 }
15746
15747 /*
15748 * Now we need to walk through the ECB descriptions in the enabling.
15749 */
15750 for (i = 0; i < enab->dten_ndesc; i++) {
15751 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15752 dtrace_probedesc_t *desc = &ep->dted_probe;
15753
15754 /* APPLE NOTE: Darwin employs size-bounded string operations. */
15755 if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
15756 continue;
15757
15758 if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
15759 continue;
15760
15761 if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
15762 continue;
15763
15764 if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
15765 ep)) != 0) {
15766 /*
15767 * Adding this helper action failed -- we are now going
15768 * to rip out the entire generation and return failure.
15769 */
15770 (void) dtrace_helper_destroygen(p, help->dthps_generation);
15771 dtrace_enabling_destroy(enab);
15772 dtrace_dof_destroy(dof);
15773 return (-1);
15774 }
15775
15776 nhelpers++;
15777 }
15778
15779 if (nhelpers < enab->dten_ndesc)
15780 dtrace_dof_error(dof, "unmatched helpers");
15781
15782 gen = help->dthps_generation++;
15783 dtrace_enabling_destroy(enab);
15784
15785 if (dhp != NULL && nprovs > 0) {
15786 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15787 if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
15788 lck_mtx_unlock(&dtrace_lock);
15789 dtrace_helper_provider_register(p, help, dhp);
15790 lck_mtx_lock(&dtrace_lock);
15791
15792 destroy = 0;
15793 }
15794 }
15795
15796 if (destroy)
15797 dtrace_dof_destroy(dof);
15798
15799 return (gen);
15800 }
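/*
 * In outline, dtrace_helper_slurp() above proceeds as follows (a
 * summary of the code, not normative documentation):
 *
 *   1. dtrace_dof_slurp() parses the dof into an enabling.
 *   2. Each DOF_SECT_PROVIDER section is run through
 *      dtrace_helper_provider_validate().
 *   3. Each dtrace:helper:ustack ECB description becomes a helper
 *      action via dtrace_helper_action_add(); any failure rips out
 *      the whole generation.
 *   4. If providers were found, dtrace_helper_provider_add() and
 *      dtrace_helper_provider_register() take over the dof, in which
 *      case it is retained (destroy == 0) rather than destroyed.
 */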
15801
15802 /*
15803 * APPLE NOTE: DTrace lazy dof implementation
15804 *
15805 * DTrace user static probes (USDT probes) and helper actions are loaded
15806 * in a process by processing dof sections. The dof sections are passed
15807 * into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
15808 * expensive to process dof for a process that will never use it. There
15809 * is a memory cost (allocating the providers/probes), and a cpu cost
15810 * (creating the providers/probes).
15811 *
15812 * To reduce this cost, we use "lazy dof". The normal procedure for
15813 * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
15814 * block, and invoke dtrace_helper_slurp() on them. When "lazy dof" is
15815 * used, each process retains the dof_ioctl_data_t block, instead of
15816 * copying in the data it points to.
15817 *
15818 * The dof_ioctl_data_t blocks are managed as if they were the actual
15819 * processed dof; on fork the block is copied to the child, on exec and
15820 * exit the block is freed.
15821 *
15822 * If the process loads library(s) containing additional dof, the
15823 * new dof_ioctl_data_t is merged with the existing block.
15824 *
15825 * There are a few catches that make this slightly more difficult.
15826 * When dyld registers dof_ioctl_data_t blocks, it expects a unique
15827 * identifier value for each dof in the block. In non-lazy dof terms,
15828 * this is the generation that dof was loaded in. If we hand back
15829 * a UID for a lazy dof, that same UID must be able to unload the
15830 * dof once it has become non-lazy. To meet this requirement, the
15831 * code that loads lazy dof requires that the UID's for dof(s) in
15832 * the lazy dof be sorted in ascending order. It is okay to skip
15833 * UIDs, e.g., 1 -> 5 -> 6 is legal.
15834 *
15835 * Once a process has become non-lazy, it will stay non-lazy. All
15836 * future dof operations for that process will be non-lazy, even
15837 * if the dof mode transitions back to lazy.
15838 *
15839 * Always do lazy dof checks before non-lazy (i.e., in fork, exit, and exec).
15840 * That way if the lazy check fails due to transitioning to non-lazy, the
15841 * right thing is done with the newly faulted in dof.
15842 */
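/*
 * A worked example of the generation rule (hypothetical values): a
 * process already holding lazy dofs with generations { 1, 2 } registers
 * a block of two more. The next generation is the last existing
 * entry's dofhp_dof + 1, so the incoming dofs are stamped 3 and 4 and
 * the merged block is { 1, 2, 3, 4 } -- strictly ascending, with gaps
 * permitted.
 */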
15843
15844 /*
15845 * This method is a bit squicky. It must handle:
15846 *
15847 * dof should not be lazy.
15848 * dof should have been handled lazily, but there was an error.
15849 * dof was handled lazily, and needs to be freed.
15850 * dof was handled lazily, and must not be freed.
15851 *
15852 *
15853 * Returns EACCES if dof should be handled non-lazily.
15854 *
15855 * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
15856 *
15857 * If the dofs data is claimed by this method, dofs_claimed will be set.
15858 * Callers should not free claimed dofs.
15859 */
15860 static int
15861 dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
15862 {
15863 ASSERT(p);
15864 ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
15865
15866 int rval = 0;
15867 *dofs_claimed = 0;
15868
15869 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15870
15871 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15872 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15873
15874 /*
15875 * Any existing helpers force non-lazy behavior.
15876 */
15877 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15878 dtrace_sprlock(p);
15879
15880 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15881 unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
15882 unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
15883
15884 /*
15885 * Range check...
15886 */
15887 if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
15888 dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
15889 rval = EINVAL;
15890 goto unlock;
15891 }
15892
15893 /*
15894 * Each dof being added must be assigned a unique generation.
15895 */
15896 uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
15897 for (i=0; i<incoming_dofs->dofiod_count; i++) {
15898 /*
15899 * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
15900 */
15901 ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
15902 incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
15903 }
15904
15905
15906 if (existing_dofs) {
15907 /*
15908 * Merge the existing and incoming dofs
15909 */
15910 size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
15911 dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
15912
15913 bcopy(&existing_dofs->dofiod_helpers[0],
15914 &merged_dofs->dofiod_helpers[0],
15915 sizeof(dof_helper_t) * existing_dofs_count);
15916 bcopy(&incoming_dofs->dofiod_helpers[0],
15917 &merged_dofs->dofiod_helpers[existing_dofs_count],
15918 sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
15919
15920 merged_dofs->dofiod_count = merged_dofs_count;
15921
15922 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15923
15924 p->p_dtrace_lazy_dofs = merged_dofs;
15925 } else {
15926 /*
15927 * Claim the incoming dofs
15928 */
15929 *dofs_claimed = 1;
15930 p->p_dtrace_lazy_dofs = incoming_dofs;
15931 }
15932
15933 #if DEBUG
15934 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15935 for (i=0; i<all_dofs->dofiod_count-1; i++) {
15936 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15937 }
15938 #endif /* DEBUG */
15939
15940 unlock:
15941 dtrace_sprunlock(p);
15942 } else {
15943 rval = EACCES;
15944 }
15945
15946 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15947
15948 return rval;
15949 }
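/*
 * A sketch of the expected caller pattern (hypothetical code; the real
 * call site lives in the helper-device ioctl path):
 *
 *	int claimed = 0;
 *	int err = dtrace_lazy_dofs_add(p, dofs, &claimed);
 *	if (err == EACCES)
 *		err = <take the non-lazy path instead>;
 *	if (!claimed)
 *		kmem_free(dofs, DOF_IOCTL_DATA_T_SIZE(dofs->dofiod_count));
 *
 * The claim rule is the point: once dofs_claimed is set, the block is
 * owned by the process and the caller must not free it.
 */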
15950
15951 /*
15952 * Returns:
15953 *
15954 * EINVAL: lazy dof is enabled, but the requested generation was not found.
15955 * EACCES: This removal needs to be handled non-lazily.
15956 */
15957 static int
15958 dtrace_lazy_dofs_remove(proc_t *p, int generation)
15959 {
15960 int rval = EINVAL;
15961
15962 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15963
15964 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15965 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15966
15967 /*
15968 * Any existing helpers force non-lazy behavior.
15969 */
15970 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15971 dtrace_sprlock(p);
15972
15973 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15974
15975 if (existing_dofs) {
15976 int index, existing_dofs_count = existing_dofs->dofiod_count;
15977 for (index=0; index<existing_dofs_count; index++) {
15978 if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
15979 dof_ioctl_data_t* removed_dofs = NULL;
15980
15981 /*
15982 * If there is only 1 dof, we'll delete it and swap in NULL.
15983 */
15984 if (existing_dofs_count > 1) {
15985 int removed_dofs_count = existing_dofs_count - 1;
15986 size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
15987
15988 removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
15989 removed_dofs->dofiod_count = removed_dofs_count;
15990
15991 /*
15992 * copy the remaining data.
15993 */
15994 if (index > 0) {
15995 bcopy(&existing_dofs->dofiod_helpers[0],
15996 &removed_dofs->dofiod_helpers[0],
15997 index * sizeof(dof_helper_t));
15998 }
15999
16000 if (index < existing_dofs_count-1) {
16001 bcopy(&existing_dofs->dofiod_helpers[index+1],
16002 &removed_dofs->dofiod_helpers[index],
16003 (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
16004 }
16005 }
16006
16007 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
16008
16009 p->p_dtrace_lazy_dofs = removed_dofs;
16010
16011 rval = KERN_SUCCESS;
16012
16013 break;
16014 }
16015 }
16016
16017 #if DEBUG
16018 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
16019 if (all_dofs) {
16020 unsigned int i;
16021 for (i=0; i<all_dofs->dofiod_count-1; i++) {
16022 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
16023 }
16024 }
16025 #endif
16026
16027 }
16028 dtrace_sprunlock(p);
16029 } else {
16030 rval = EACCES;
16031 }
16032
16033 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16034
16035 return rval;
16036 }
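/*
 * Removal compacts the block around the matched index. For example
 * (hypothetical generations): removing generation 2 from { 1, 2, 3 }
 * bcopy()s entry 0 into slot 0 and entry 2 into slot 1 of a freshly
 * allocated two-entry block, yielding { 1, 3 }. Removing the only
 * entry of a one-entry block skips the allocation entirely and leaves
 * p_dtrace_lazy_dofs NULL.
 */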
16037
16038 void
16039 dtrace_lazy_dofs_destroy(proc_t *p)
16040 {
16041 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16042 dtrace_sprlock(p);
16043
16044 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16045
16046 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16047 p->p_dtrace_lazy_dofs = NULL;
16048
16049 dtrace_sprunlock(p);
16050 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16051
16052 if (lazy_dofs) {
16053 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16054 }
16055 }
16056
16057 static int
16058 dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
16059 {
16060 #pragma unused(ignored)
16061 /*
16062 * Okay to NULL test without taking the sprlock.
16063 */
16064 return p->p_dtrace_lazy_dofs != NULL;
16065 }
16066
16067 static void
16068 dtrace_lazy_dofs_process(proc_t *p) {
16069 /*
16070 * It is possible this process may exit during our attempt to
16071 * fault in the dof. We could fix this by holding locks longer,
16072 * but the errors are benign.
16073 */
16074 dtrace_sprlock(p);
16075
16076
16077 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16078 ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
16079
16080 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16081 p->p_dtrace_lazy_dofs = NULL;
16082
16083 dtrace_sprunlock(p);
16084 lck_mtx_lock(&dtrace_meta_lock);
16085 /*
16086 * Process each dof_helper_t
16087 */
16088 if (lazy_dofs != NULL) {
16089 unsigned int i;
16090 int rval;
16091
16092 for (i=0; i<lazy_dofs->dofiod_count; i++) {
16093 /*
16094 * When loading lazy dof, we depend on the generations being sorted in ascending order.
16095 */
16096 ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
16097
16098 dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
16099
16100 /*
16101 * We stored the generation in dofhp_dof. Save it, and restore the original value.
16102 */
16103 int generation = dhp->dofhp_dof;
16104 dhp->dofhp_dof = dhp->dofhp_addr;
16105
16106 dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
16107
16108 if (dof != NULL) {
16109 dtrace_helpers_t *help;
16110
16111 lck_mtx_lock(&dtrace_lock);
16112
16113 /*
16114 * This must be done with the dtrace_lock held
16115 */
16116 if ((help = p->p_dtrace_helpers) == NULL)
16117 help = dtrace_helpers_create(p);
16118
16119 /*
16120 * If the generation value has been bumped, someone snuck in
16121 * when we released the dtrace lock. We have to dump this generation;
16122 * there is no safe way to load it.
16123 */
16124 if (help->dthps_generation <= generation) {
16125 help->dthps_generation = generation;
16126
16127 /*
16128 * dtrace_helper_slurp() takes responsibility for the dof --
16129 * it may free it now or it may save it and free it later.
16130 */
16131 if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
16132 dtrace_dof_error(NULL, "returned value did not match expected generation");
16133 }
16134 }
16135
16136 lck_mtx_unlock(&dtrace_lock);
16137 }
16138 }
16139 lck_mtx_unlock(&dtrace_meta_lock);
16140 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16141 } else {
16142 lck_mtx_unlock(&dtrace_meta_lock);
16143 }
16144 }
16145
16146 static int
16147 dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
16148 {
16149 #pragma unused(ignored)
16150
16151 dtrace_lazy_dofs_process(p);
16152
16153 return PROC_RETURNED;
16154 }
16155
16156 #define DTRACE_LAZY_DOFS_DUPLICATED 1
16157
16158 static int
16159 dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
16160 {
16161 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
16162 LCK_MTX_ASSERT(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16163 LCK_MTX_ASSERT(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16164
16165 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16166 dtrace_sprlock(parent);
16167
16168 /*
16169 * We need to make sure that the transition to lazy dofs -> helpers
16170 * was atomic for our parent
16171 */
16172 ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
16173 /*
16174 * In theory we should hold the child sprlock, but this is safe...
16175 */
16176 ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
16177
16178 dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
16179 dof_ioctl_data_t* child_dofs = NULL;
16180 if (parent_dofs) {
16181 size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
16182 child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
16183 bcopy(parent_dofs, child_dofs, parent_dofs_size);
16184 }
16185
16186 dtrace_sprunlock(parent);
16187
16188 if (child_dofs) {
16189 dtrace_sprlock(child);
16190 child->p_dtrace_lazy_dofs = child_dofs;
16191 dtrace_sprunlock(child);
16192 /*
16193 * We process the DOF at this point if the mode is set to
16194 * LAZY_OFF. This can happen if DTrace is still processing the
16195 * DOF of another process (which can happen because the
16196 * protected pager can have a huge latency)
16197 * but has not yet processed our parent.
16198 */
16199 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
16200 dtrace_lazy_dofs_process(child);
16201 }
16202 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16203
16204 return DTRACE_LAZY_DOFS_DUPLICATED;
16205 }
16206 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16207
16208 return 0;
16209 }
16210
16211 static dtrace_helpers_t *
16212 dtrace_helpers_create(proc_t *p)
16213 {
16214 dtrace_helpers_t *help;
16215
16216 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
16217 ASSERT(p->p_dtrace_helpers == NULL);
16218
16219 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16220 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16221 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16222
16223 p->p_dtrace_helpers = help;
16224 dtrace_helpers++;
16225
16226 return (help);
16227 }
16228
16229 static void
16230 dtrace_helpers_destroy(proc_t* p)
16231 {
16232 dtrace_helpers_t *help;
16233 dtrace_vstate_t *vstate;
16234 uint_t i;
16235
16236 lck_mtx_lock(&dtrace_meta_lock);
16237 lck_mtx_lock(&dtrace_lock);
16238
16239 ASSERT(p->p_dtrace_helpers != NULL);
16240 ASSERT(dtrace_helpers > 0);
16241
16242 help = p->p_dtrace_helpers;
16243 vstate = &help->dthps_vstate;
16244
16245 /*
16246 * We're now going to lose the help from this process.
16247 */
16248 p->p_dtrace_helpers = NULL;
16249 dtrace_sync();
16250
16251 /*
16252 * Destroy the helper actions.
16253 */
16254 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16255 dtrace_helper_action_t *h, *next;
16256
16257 for (h = help->dthps_actions[i]; h != NULL; h = next) {
16258 next = h->dtha_next;
16259 dtrace_helper_action_destroy(h, vstate);
16261 }
16262 }
16263
16264 lck_mtx_unlock(&dtrace_lock);
16265
16266 /*
16267 * Destroy the helper providers.
16268 */
16269 if (help->dthps_maxprovs > 0) {
16270 if (dtrace_meta_pid != NULL) {
16271 ASSERT(dtrace_deferred_pid == NULL);
16272
16273 for (i = 0; i < help->dthps_nprovs; i++) {
16274 dtrace_helper_provider_remove(
16275 &help->dthps_provs[i]->dthp_prov, p);
16276 }
16277 } else {
16278 lck_mtx_lock(&dtrace_lock);
16279 ASSERT(help->dthps_deferred == 0 ||
16280 help->dthps_next != NULL ||
16281 help->dthps_prev != NULL ||
16282 help == dtrace_deferred_pid);
16283
16284 /*
16285 * Remove the helper from the deferred list.
16286 */
16287 if (help->dthps_next != NULL)
16288 help->dthps_next->dthps_prev = help->dthps_prev;
16289 if (help->dthps_prev != NULL)
16290 help->dthps_prev->dthps_next = help->dthps_next;
16291 if (dtrace_deferred_pid == help) {
16292 dtrace_deferred_pid = help->dthps_next;
16293 ASSERT(help->dthps_prev == NULL);
16294 }
16295
16296 lck_mtx_unlock(&dtrace_lock);
16297 }
16298
16299
16300 for (i = 0; i < help->dthps_nprovs; i++) {
16301 dtrace_helper_provider_destroy(help->dthps_provs[i]);
16302 }
16303
16304 kmem_free(help->dthps_provs, help->dthps_maxprovs *
16305 sizeof (dtrace_helper_provider_t *));
16306 }
16307
16308 lck_mtx_lock(&dtrace_lock);
16309
16310 dtrace_vstate_fini(&help->dthps_vstate);
16311 kmem_free(help->dthps_actions,
16312 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16313 kmem_free(help, sizeof (dtrace_helpers_t));
16314
16315 --dtrace_helpers;
16316 lck_mtx_unlock(&dtrace_lock);
16317 lck_mtx_unlock(&dtrace_meta_lock);
16318 }
16319
16320 static void
16321 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16322 {
16323 dtrace_helpers_t *help, *newhelp;
16324 dtrace_helper_action_t *helper, *new, *last;
16325 dtrace_difo_t *dp;
16326 dtrace_vstate_t *vstate;
16327 uint_t i;
16328 int j, sz, hasprovs = 0;
16329
16330 lck_mtx_lock(&dtrace_meta_lock);
16331 lck_mtx_lock(&dtrace_lock);
16332 ASSERT(from->p_dtrace_helpers != NULL);
16333 ASSERT(dtrace_helpers > 0);
16334
16335 help = from->p_dtrace_helpers;
16336 newhelp = dtrace_helpers_create(to);
16337 ASSERT(to->p_dtrace_helpers != NULL);
16338
16339 newhelp->dthps_generation = help->dthps_generation;
16340 vstate = &newhelp->dthps_vstate;
16341
16342 /*
16343 * Duplicate the helper actions.
16344 */
16345 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16346 if ((helper = help->dthps_actions[i]) == NULL)
16347 continue;
16348
16349 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16350 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16351 KM_SLEEP);
16352 new->dtha_generation = helper->dtha_generation;
16353
16354 if ((dp = helper->dtha_predicate) != NULL) {
16355 dp = dtrace_difo_duplicate(dp, vstate);
16356 new->dtha_predicate = dp;
16357 }
16358
16359 new->dtha_nactions = helper->dtha_nactions;
16360 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16361 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16362
16363 for (j = 0; j < new->dtha_nactions; j++) {
16364 dtrace_difo_t *dpj = helper->dtha_actions[j];
16365
16366 ASSERT(dpj != NULL);
16367 dpj = dtrace_difo_duplicate(dpj, vstate);
16368 new->dtha_actions[j] = dpj;
16369 }
16370
16371 if (last != NULL) {
16372 last->dtha_next = new;
16373 } else {
16374 newhelp->dthps_actions[i] = new;
16375 }
16376
16377 last = new;
16378 }
16379 }
16380
16381 /*
16382 * Duplicate the helper providers and register them with the
16383 * DTrace framework.
16384 */
16385 if (help->dthps_nprovs > 0) {
16386 newhelp->dthps_nprovs = help->dthps_nprovs;
16387 newhelp->dthps_maxprovs = help->dthps_nprovs;
16388 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16389 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16390 for (i = 0; i < newhelp->dthps_nprovs; i++) {
16391 newhelp->dthps_provs[i] = help->dthps_provs[i];
16392 newhelp->dthps_provs[i]->dthp_ref++;
16393 }
16394
16395 hasprovs = 1;
16396 }
16397
16398 lck_mtx_unlock(&dtrace_lock);
16399
16400 if (hasprovs)
16401 dtrace_helper_provider_register(to, newhelp, NULL);
16402
16403 lck_mtx_unlock(&dtrace_meta_lock);
16404 }
16405
16406 /*
16407 * DTrace Process functions
16408 */
16409
16410 void
16411 dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn)
16412 {
16413 /*
16414 * This code applies to new processes who are copying the task
16415 * and thread state and address spaces of their parent process.
16416 */
16417 if (!spawn) {
16418 /*
16419 * APPLE NOTE: Solaris does a sprlock() and drops the
16420 * proc_lock here. We're cheating a bit and only taking
16421 * the p_dtrace_sprlock lock. A full sprlock would
16422 * task_suspend the parent.
16423 */
16424 dtrace_sprlock(parent_proc);
16425
16426 /*
16427 * Remove all DTrace tracepoints from the child process. We
16428 * need to do this _before_ duplicating USDT providers since
16429 * any associated probes may be immediately enabled.
16430 */
16431 if (parent_proc->p_dtrace_count > 0) {
16432 dtrace_fasttrap_fork(parent_proc, child_proc);
16433 }
16434
16435 dtrace_sprunlock(parent_proc);
16436
16437 /*
16438 * Duplicate any lazy dof(s). This must be done while NOT
16439 * holding the parent sprlock! Lock ordering is
16440 * dtrace_dof_mode_lock, then sprlock. It is imperative we
16441 * always call dtrace_lazy_dofs_duplicate, rather than null
16442 * check and call if !NULL. If we NULL test, during lazy dof
16443 * faulting we can race with the faulting code and proceed
16444 * from here to beyond the helpers copy. The lazy dof
16445 * faulting will then fail to copy the helpers to the child
16446 * process. We return if we duplicated lazy dofs, as a process
16447 * can only have one of the two at a time; this avoids a race between
16448 * a dtrace client and dtrace_proc_fork where a process would
16449 * end up with both lazy dofs and helpers.
16450 */
16451 if (dtrace_lazy_dofs_duplicate(parent_proc, child_proc) == DTRACE_LAZY_DOFS_DUPLICATED) {
16452 return;
16453 }
16454
16455 /*
16456 * Duplicate any helper actions and providers if they haven't
16457 * already.
16458 */
16459 #if !defined(__APPLE__)
16460 /*
16461 * The SFORKING flag
16462 * we set above informs the code that enables USDT probes that
16463 * sprlock() may fail because the child is being forked.
16464 */
16465 #endif
16466 /*
16467 * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent
16468 * never fails to find the child. We do not set SFORKING.
16469 */
16470 if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
16471 (*dtrace_helpers_fork)(parent_proc, child_proc);
16472 }
16473 }
16474 }
16475
16476 void
16477 dtrace_proc_exec(proc_t *p)
16478 {
16479 /*
16480 * Invalidate any predicate evaluation already cached for this thread by DTrace.
16481 * That's because we've just stored to p_comm and DTrace refers to that when it
16482 * evaluates the "execname" special variable. uid and gid may have changed as well.
16483 */
16484 dtrace_set_thread_predcache(current_thread(), 0);
16485
16486 /*
16487 * Free any outstanding lazy dof entries. It is imperative we
16488 * always call dtrace_lazy_dofs_destroy, rather than null check
16489 * and call if !NULL. If we NULL test, during lazy dof faulting
16490 * we can race with the faulting code and proceed from here to
16491 * beyond the helpers cleanup. The lazy dof faulting will then
16492 * install new helpers which no longer belong to this process!
16493 */
16494 dtrace_lazy_dofs_destroy(p);
16495
16496
16497 /*
16498 * Clean up any DTrace helpers for the process.
16499 */
16500 if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
16501 (*dtrace_helpers_cleanup)(p);
16502 }
16503
16504 /*
16505 * Cleanup the DTrace provider associated with this process.
16506 */
16507 proc_lock(p);
16508 if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
16509 (*dtrace_fasttrap_exec_ptr)(p);
16510 }
16511 proc_unlock(p);
16512 }
16513
16514 void
16515 dtrace_proc_exit(proc_t *p)
16516 {
16517 /*
16518 * Free any outstanding lazy dof entries. It is imperative we
16519 * always call dtrace_lazy_dofs_destroy, rather than null check
16520 * and call if !NULL. If we NULL test, during lazy dof faulting
16521 * we can race with the faulting code and proceed from here to
16522 * beyond the helpers cleanup. The lazy dof faulting will then
16523 * install new helpers which will never be cleaned up, and leak.
16524 */
16525 dtrace_lazy_dofs_destroy(p);
16526
16527 /*
16528 * Clean up any DTrace helper actions or probes for the process.
16529 */
16530 if (p->p_dtrace_helpers != NULL) {
16531 (*dtrace_helpers_cleanup)(p);
16532 }
16533
16534 /*
16535 * Clean up any DTrace probes associated with this process.
16536 */
16537 /*
16538 * APPLE NOTE: We release ptss pages/entries in dtrace_fasttrap_exit_ptr(),
16539 * call this after dtrace_helpers_cleanup()
16540 */
16541 proc_lock(p);
16542 if (p->p_dtrace_probes && dtrace_fasttrap_exit_ptr) {
16543 (*dtrace_fasttrap_exit_ptr)(p);
16544 }
16545 proc_unlock(p);
16546 }
16547
16548 /*
16549 * DTrace Hook Functions
16550 */
16551
16552 /*
16553 * APPLE NOTE: dtrace_modctl_* routines for kext support.
16554 * Used to manipulate the modctl list within dtrace xnu.
16555 */
16556
16557 modctl_t *dtrace_modctl_list;
16558
16559 static void
16560 dtrace_modctl_add(struct modctl * newctl)
16561 {
16562 struct modctl *nextp, *prevp;
16563
16564 ASSERT(newctl != NULL);
16565 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16566
16567 // Insert the new module at the front of the list.
16568
16569 newctl->mod_next = dtrace_modctl_list;
16570 dtrace_modctl_list = newctl;
16571
16572 /*
16573 * If a module exists with the same name, then that module
16574 * must have been unloaded with enabled probes. We will move
16575 * the unloaded module to the new module's stale chain and
16576 * then stop traversing the list.
16577 */
16578
16579 prevp = newctl;
16580 nextp = newctl->mod_next;
16581
16582 while (nextp != NULL) {
16583 if (nextp->mod_loaded) {
16584 /* This is a loaded module. Keep traversing. */
16585 prevp = nextp;
16586 nextp = nextp->mod_next;
16587 continue;
16588 }
16589 else {
16590 /* Found an unloaded module */
16591 if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
16592 /* Names don't match. Keep traversing. */
16593 prevp = nextp;
16594 nextp = nextp->mod_next;
16595 continue;
16596 }
16597 else {
16598 /* We found a stale entry, move it. We're done. */
16599 prevp->mod_next = nextp->mod_next;
16600 newctl->mod_stale = nextp;
16601 nextp->mod_next = NULL;
16602 break;
16603 }
16604 }
16605 }
16606 }
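/*
 * Example (an illustrative scenario): suppose kext "com.example.foo"
 * was unloaded while it still had enabled probes, leaving its modctl
 * on the list with mod_loaded == 0. When the kext loads again, the
 * new modctl is pushed on the front, and the traversal above finds
 * the stale entry by name, unlinks it, and parks it on the new
 * entry's mod_stale chain, where dtrace_modctl_remove() will
 * eventually free it.
 */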
16607
16608 static modctl_t *
16609 dtrace_modctl_lookup(struct kmod_info * kmod)
16610 {
16611 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16612
16613 struct modctl * ctl;
16614
16615 for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
16616 if (ctl->mod_id == kmod->id)
16617 return(ctl);
16618 }
16619 return (NULL);
16620 }
16621
16622 /*
16623 * This routine is called from dtrace_module_unloaded().
16624 * It removes a modctl structure and its stale chain
16625 * from the kext shadow list.
16626 */
16627 static void
16628 dtrace_modctl_remove(struct modctl * ctl)
16629 {
16630 ASSERT(ctl != NULL);
16631 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16632 modctl_t *prevp, *nextp, *curp;
16633
16634 // Remove stale chain first
16635 for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
16636 nextp = curp->mod_stale;
16637 /* There should NEVER be user symbols allocated at this point */
16638 ASSERT(curp->mod_user_symbols == NULL);
16639 kmem_free(curp, sizeof(modctl_t));
16640 }
16641
16642 prevp = NULL;
16643 curp = dtrace_modctl_list;
16644
16645 while (curp != ctl) {
16646 prevp = curp;
16647 curp = curp->mod_next;
16648 }
16649
16650 if (prevp != NULL) {
16651 prevp->mod_next = ctl->mod_next;
16652 }
16653 else {
16654 dtrace_modctl_list = ctl->mod_next;
16655 }
16656
16657 /* There should NEVER be user symbols allocated at this point */
16658 ASSERT(ctl->mod_user_symbols == NULL);
16659
16660 kmem_free (ctl, sizeof(modctl_t));
16661 }
16662
16663 /*
16664 * APPLE NOTE: The kext loader will call dtrace_module_loaded
16665 * when the kext is loaded in memory, but before calling the
16666 * kext's start routine.
16667 *
16668 * Return 0 on success
16669 * Return -1 on failure
16670 */
16671
16672 static int
16673 dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
16674 {
16675 dtrace_provider_t *prv;
16676
16677 /*
16678 * If kernel symbols have been disabled, return immediately.
16679 * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode; it is safe to test without holding locks.
16680 */
16681 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
16682 return 0;
16683
16684 struct modctl *ctl = NULL;
16685 if (!kmod || kmod->address == 0 || kmod->size == 0)
16686 return(-1);
16687
16688 lck_mtx_lock(&dtrace_provider_lock);
16689 lck_mtx_lock(&mod_lock);
16690
16691 /*
16692 * Have we seen this kext before?
16693 */
16694
16695 ctl = dtrace_modctl_lookup(kmod);
16696
16697 if (ctl != NULL) {
16698 /* bail... we already have this kext in the modctl list */
16699 lck_mtx_unlock(&mod_lock);
16700 lck_mtx_unlock(&dtrace_provider_lock);
16701 if (dtrace_err_verbose)
16702 cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
16703 return(-1);
16704 }
16705 else {
16706 ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
16707 if (ctl == NULL) {
16708 if (dtrace_err_verbose)
16709 cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
16710 lck_mtx_unlock(&mod_lock);
16711 lck_mtx_unlock(&dtrace_provider_lock);
16712 return (-1);
16713 }
16714 ctl->mod_next = NULL;
16715 ctl->mod_stale = NULL;
16716 strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
16717 ctl->mod_loadcnt = kmod->id;
16718 ctl->mod_nenabled = 0;
16719 ctl->mod_address = kmod->address;
16720 ctl->mod_size = kmod->size;
16721 ctl->mod_id = kmod->id;
16722 ctl->mod_loaded = 1;
16723 ctl->mod_flags = 0;
16724 ctl->mod_user_symbols = NULL;
16725
16726 /*
16727 * Find the UUID for this module, if it has one
16728 */
16729 kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
16730 struct load_command* load_cmd = (struct load_command *)&header[1];
16731 uint32_t i;
16732 for (i = 0; i < header->ncmds; i++) {
16733 if (load_cmd->cmd == LC_UUID) {
16734 struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
16735 memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
16736 ctl->mod_flags |= MODCTL_HAS_UUID;
16737 break;
16738 }
16739 load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
16740 }
16741
16742 if (ctl->mod_address == g_kernel_kmod_info.address) {
16743 ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
16744 memcpy(dtrace_kerneluuid, ctl->mod_uuid, sizeof(dtrace_kerneluuid));
16745 }
16746 /*
16747 * Static kexts have a UUID that is not used for symbolication, as all their
16748 * symbols are in the kernel.
16749 */
16750 else if ((flag & KMOD_DTRACE_STATIC_KEXT) == KMOD_DTRACE_STATIC_KEXT) {
16751 memcpy(ctl->mod_uuid, dtrace_kerneluuid, sizeof(dtrace_kerneluuid));
16752 ctl->mod_flags |= MODCTL_IS_STATIC_KEXT;
16753 }
16754 }
16755 dtrace_modctl_add(ctl);
16756
16757 /*
16758 * We must hold the dtrace_lock to safely test the non-permanent dtrace_kernel_symbol_mode(s).
16759 */
16760 lck_mtx_lock(&dtrace_lock);
16761
16762 /*
16763 * DTrace must decide if it will instrument modules lazily via
16764 * userspace symbols (default mode), or instrument immediately via
16765 * kernel symbols (non-default mode)
16766 *
16767 * When in default/lazy mode, DTrace will only support modules
16768 * built with a valid UUID.
16769 *
16770 * Overriding the default can be done explicitly in one of
16771 * the following two ways.
16772 *
16773 * A module can force symbols from kernel space using the plist key,
16774 * OSBundleForceDTraceInit (see kmod.h). If this per kext state is set,
16775 * we fall through and instrument this module now.
16776 *
16777 * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
16778 * from kernel space (see dtrace_impl.h). If this system state is set
16779 * to a non-userspace mode, we fall through and instrument the module now.
16780 */
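/*
 * The same rules in tabular form (a restatement, not new policy):
 *
 *   dtrace_kernel_symbol_mode   KMOD_DTRACE_FORCE_INIT   action
 *   FROM_USERSPACE              clear                    defer (lazy)
 *   FROM_USERSPACE              set                      instrument now
 *   any other mode              either                   instrument now
 */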
16781
16782 if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
16783 (!(flag & KMOD_DTRACE_FORCE_INIT)))
16784 {
16785 /* We will instrument the module lazily -- this is the default */
16786 lck_mtx_unlock(&dtrace_lock);
16787 lck_mtx_unlock(&mod_lock);
16788 lck_mtx_unlock(&dtrace_provider_lock);
16789 return 0;
16790 }
16791
16792 /* We will instrument the module immediately using kernel symbols */
16793 ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
16794
16795 lck_mtx_unlock(&dtrace_lock);
16796
16797 /*
16798 * We're going to call each provider's per-module provide operation
16799 * specifying only this module.
16800 */
16801 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16802 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16803
16804 /*
16805 * APPLE NOTE: The contract with the kext loader is that once this function
16806 * has completed, it may delete kernel symbols at will.
16807 * We must set this while still holding the mod_lock.
16808 */
16809 ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
16810
16811 lck_mtx_unlock(&mod_lock);
16812 lck_mtx_unlock(&dtrace_provider_lock);
16813
16814 /*
16815 * If we have any retained enablings, we need to match against them.
16816 * Enabling probes requires that cpu_lock be held, and we cannot hold
16817 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16818 * module. (In particular, this happens when loading scheduling
16819 * classes.) So if we have any retained enablings, we need to dispatch
16820 * our task queue to do the match for us.
16821 */
16822 lck_mtx_lock(&dtrace_lock);
16823
16824 if (dtrace_retained == NULL) {
16825 lck_mtx_unlock(&dtrace_lock);
16826 return 0;
16827 }
16828
16829 /* APPLE NOTE!
16830 *
16831 * The cpu_lock mentioned above is only held by dtrace code; Apple's xnu never actually
16832 * holds it for any reason. Thus the comment above is invalid: we can directly invoke
16833 * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid
16834 * the delay call as well.
16835 */
16836 lck_mtx_unlock(&dtrace_lock);
16837
16838 dtrace_enabling_matchall();
16839
16840 return 0;
16841 }
16842
16843 /*
16844 * Return 0 on success
16845 * Return -1 on failure
16846 */
16847 static int
16848 dtrace_module_unloaded(struct kmod_info *kmod)
16849 {
16850 dtrace_probe_t template, *probe, *first, *next;
16851 dtrace_provider_t *prov;
16852 struct modctl *ctl = NULL;
16853 struct modctl *syncctl = NULL;
16854 struct modctl *nextsyncctl = NULL;
16855 int syncmode = 0;
16856
16857 lck_mtx_lock(&dtrace_provider_lock);
16858 lck_mtx_lock(&mod_lock);
16859 lck_mtx_lock(&dtrace_lock);
16860
16861 if (kmod == NULL) {
16862 syncmode = 1;
16863 }
16864 else {
16865 ctl = dtrace_modctl_lookup(kmod);
16866 if (ctl == NULL)
16867 {
16868 lck_mtx_unlock(&dtrace_lock);
16869 lck_mtx_unlock(&mod_lock);
16870 lck_mtx_unlock(&dtrace_provider_lock);
16871 return (-1);
16872 }
16873 ctl->mod_loaded = 0;
16874 ctl->mod_address = 0;
16875 ctl->mod_size = 0;
16876 }
16877
16878 if (dtrace_bymod == NULL) {
16879 /*
16880 * The DTrace module is loaded (obviously) but not attached;
16881 * we don't have any work to do.
16882 */
16883 if (ctl != NULL)
16884 (void)dtrace_modctl_remove(ctl);
16885 lck_mtx_unlock(&dtrace_lock);
16886 lck_mtx_unlock(&mod_lock);
16887 lck_mtx_unlock(&dtrace_provider_lock);
16888 return(0);
16889 }
16890
16891 /* Syncmode set means we target and traverse the entire modctl list. */
16892 if (syncmode)
16893 nextsyncctl = dtrace_modctl_list;
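/*
 * Sync mode (kmod == NULL) sweeps the entire shadow list: each pass
 * through syncloop below selects the next modctl whose mod_address is
 * 0 -- i.e., a stale, unloaded kext -- and reclaims its probes, until
 * no such entries remain.
 */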
16894
16895 syncloop:
16896 if (syncmode)
16897 {
16898 /* find a stale modctl struct */
16899 for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
16900 if (syncctl->mod_address == 0)
16901 break;
16902 }
16903 if (syncctl==NULL)
16904 {
16905 /* We have no more work to do */
16906 lck_mtx_unlock(&dtrace_lock);
16907 lck_mtx_unlock(&mod_lock);
16908 lck_mtx_unlock(&dtrace_provider_lock);
16909 return(0);
16910 }
16911 else {
16912 /* keep track of next syncctl in case this one is removed */
16913 nextsyncctl = syncctl->mod_next;
16914 ctl = syncctl;
16915 }
16916 }
16917
16918 template.dtpr_mod = ctl->mod_modname;
16919
16920 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16921 probe != NULL; probe = probe->dtpr_nextmod) {
16922 if (probe->dtpr_ecb != NULL) {
16923 /*
16924 * This shouldn't _actually_ be possible -- we're
16925 * unloading a module that has an enabled probe in it.
16926 * (It's normally up to the provider to make sure that
16927 * this can't happen.) However, because dtps_enable()
16928 * doesn't have a failure mode, there can be an
16929 * enable/unload race. Upshot: we don't want to
16930 * assert, but we're not going to disable the
16931 * probe, either.
16932 */
16933
16934
16935 if (syncmode) {
16936 /* We're syncing, let's look at next in list */
16937 goto syncloop;
16938 }
16939
16940 lck_mtx_unlock(&dtrace_lock);
16941 lck_mtx_unlock(&mod_lock);
16942 lck_mtx_unlock(&dtrace_provider_lock);
16943
16944 if (dtrace_err_verbose) {
16945 cmn_err(CE_WARN, "unloaded module '%s' had "
16946 "enabled probes", ctl->mod_modname);
16947 }
16948 return(-1);
16949 }
16950 }
16951
16952 probe = first;
16953
16954 for (first = NULL; probe != NULL; probe = next) {
16955 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16956
16957 dtrace_probes[probe->dtpr_id - 1] = NULL;
16958 probe->dtpr_provider->dtpv_probe_count--;
16959
16960 next = probe->dtpr_nextmod;
16961 dtrace_hash_remove(dtrace_byprov, probe);
16962 dtrace_hash_remove(dtrace_bymod, probe);
16963 dtrace_hash_remove(dtrace_byfunc, probe);
16964 dtrace_hash_remove(dtrace_byname, probe);
16965
16966 if (first == NULL) {
16967 first = probe;
16968 probe->dtpr_nextmod = NULL;
16969 } else {
16970 probe->dtpr_nextmod = first;
16971 first = probe;
16972 }
16973 }
16974
16975 /*
16976 * We've removed all of the module's probes from the hash chains and
16977 * from the probe array. Now issue a dtrace_sync() to be sure that
16978 * everyone has cleared out from any probe array processing.
16979 */
16980 dtrace_sync();
16981
16982 for (probe = first; probe != NULL; probe = first) {
16983 first = probe->dtpr_nextmod;
16984 prov = probe->dtpr_provider;
16985 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16986 probe->dtpr_arg);
16987 dtrace_strunref(probe->dtpr_mod);
16988 dtrace_strunref(probe->dtpr_func);
16989 dtrace_strunref(probe->dtpr_name);
16990 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16991
16992 zfree(dtrace_probe_t_zone, probe);
16993 }
16994
16995 dtrace_modctl_remove(ctl);
16996
16997 if (syncmode)
16998 goto syncloop;
16999
17000 lck_mtx_unlock(&dtrace_lock);
17001 lck_mtx_unlock(&mod_lock);
17002 lck_mtx_unlock(&dtrace_provider_lock);
17003
17004 return(0);
17005 }
17006
17007 void
17008 dtrace_suspend(void)
17009 {
17010 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
17011 }
17012
17013 void
17014 dtrace_resume(void)
17015 {
17016 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
17017 }
17018
17019 static int
17020 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
17021 {
17022 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17023 lck_mtx_lock(&dtrace_lock);
17024
17025 switch (what) {
17026 case CPU_CONFIG: {
17027 dtrace_state_t *state;
17028 dtrace_optval_t *opt, rs, c;
17029
17030 /*
17031 * For now, we only allocate a new buffer for anonymous state.
17032 */
17033 if ((state = dtrace_anon.dta_state) == NULL)
17034 break;
17035
17036 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
17037 break;
17038
17039 opt = state->dts_options;
17040 c = opt[DTRACEOPT_CPU];
17041
17042 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
17043 break;
17044
17045 /*
17046 * Regardless of what the actual policy is, we're going to
17047 * temporarily set our resize policy to be manual. We're
17048 * also going to temporarily set our CPU option to denote
17049 * the newly configured CPU.
17050 */
17051 rs = opt[DTRACEOPT_BUFRESIZE];
17052 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
17053 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
17054
17055 (void) dtrace_state_buffers(state);
17056
17057 opt[DTRACEOPT_BUFRESIZE] = rs;
17058 opt[DTRACEOPT_CPU] = c;
17059
17060 break;
17061 }
17062
17063 case CPU_UNCONFIG:
17064 /*
17065 * We don't free the buffer in the CPU_UNCONFIG case. (The
17066 * buffer will be freed when the consumer exits.)
17067 */
17068 break;
17069
17070 default:
17071 break;
17072 }
17073
17074 lck_mtx_unlock(&dtrace_lock);
17075 return (0);
17076 }
17077
17078 static void
17079 dtrace_cpu_setup_initial(processorid_t cpu)
17080 {
17081 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
17082 }
17083
17084 static void
17085 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
17086 {
17087 if (dtrace_toxranges >= dtrace_toxranges_max) {
17088 int osize, nsize;
17089 dtrace_toxrange_t *range;
17090
17091 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17092
17093 if (osize == 0) {
17094 ASSERT(dtrace_toxrange == NULL);
17095 ASSERT(dtrace_toxranges_max == 0);
17096 dtrace_toxranges_max = 1;
17097 } else {
17098 dtrace_toxranges_max <<= 1;
17099 }
17100
17101 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17102 range = kmem_zalloc(nsize, KM_SLEEP);
17103
17104 if (dtrace_toxrange != NULL) {
17105 ASSERT(osize != 0);
17106 bcopy(dtrace_toxrange, range, osize);
17107 kmem_free(dtrace_toxrange, osize);
17108 }
17109
17110 dtrace_toxrange = range;
17111 }
17112
17113 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
17114 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
17115
17116 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
17117 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
17118 dtrace_toxranges++;
17119 }
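/*
 * The growth policy mirrors dtrace_helper_provider_add(): capacity
 * doubles on exhaustion (1, 2, 4, ...), the old ranges are bcopy()'d
 * into the new table, and the old table is freed. For instance,
 * adding a third range to a full table of capacity 2 first grows it
 * to 4 and then appends.
 */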
17120
17121 /*
17122 * DTrace Driver Cookbook Functions
17123 */
17124 /*ARGSUSED*/
17125 static int
17126 dtrace_attach(dev_info_t *devi)
17127 {
17128 dtrace_provider_id_t id;
17129 dtrace_state_t *state = NULL;
17130 dtrace_enabling_t *enab;
17131
17132 lck_mtx_lock(&cpu_lock);
17133 lck_mtx_lock(&dtrace_provider_lock);
17134 lck_mtx_lock(&dtrace_lock);
17135
17136 /* Darwin uses the BSD cloning device driver to automagically obtain a minor device number. */
17137 dtrace_devi = devi;
17138
17139 dtrace_modload = dtrace_module_loaded;
17140 dtrace_modunload = dtrace_module_unloaded;
17141 dtrace_cpu_init = dtrace_cpu_setup_initial;
17142 dtrace_helpers_cleanup = dtrace_helpers_destroy;
17143 dtrace_helpers_fork = dtrace_helpers_duplicate;
17144 dtrace_cpustart_init = dtrace_suspend;
17145 dtrace_cpustart_fini = dtrace_resume;
17146 dtrace_debugger_init = dtrace_suspend;
17147 dtrace_debugger_fini = dtrace_resume;
17148
17149 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
17150
17151 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17152
17153 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
17154 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
17155
17156 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
17157 sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
17158 NULL, NULL, NULL, NULL, NULL, 0);
17159
17160 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17161
17162 dtrace_nprobes = dtrace_nprobes_default;
17163 dtrace_probes = kmem_zalloc(sizeof(dtrace_probe_t*) * dtrace_nprobes,
17164 KM_SLEEP);
17165
17166 dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider,
17167 0, /* unused */
17168 offsetof(dtrace_probe_t, dtpr_nextprov),
17169 offsetof(dtrace_probe_t, dtpr_prevprov));
17170
17171 dtrace_bymod = dtrace_hash_create(dtrace_strkey_deref_offset,
17172 offsetof(dtrace_probe_t, dtpr_mod),
17173 offsetof(dtrace_probe_t, dtpr_nextmod),
17174 offsetof(dtrace_probe_t, dtpr_prevmod));
17175
17176 dtrace_byfunc = dtrace_hash_create(dtrace_strkey_deref_offset,
17177 offsetof(dtrace_probe_t, dtpr_func),
17178 offsetof(dtrace_probe_t, dtpr_nextfunc),
17179 offsetof(dtrace_probe_t, dtpr_prevfunc));
17180
17181 dtrace_byname = dtrace_hash_create(dtrace_strkey_deref_offset,
17182 offsetof(dtrace_probe_t, dtpr_name),
17183 offsetof(dtrace_probe_t, dtpr_nextname),
17184 offsetof(dtrace_probe_t, dtpr_prevname));
17185
17186 if (dtrace_retain_max < 1) {
17187 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
17188 "setting to 1", dtrace_retain_max);
17189 dtrace_retain_max = 1;
17190 }
17191
17192 /*
17193 * Now discover our toxic ranges.
17194 */
17195 dtrace_toxic_ranges(dtrace_toxrange_add);
17196
17197 /*
17198 * Before we register ourselves as a provider to our own framework,
17199 * we would like to assert that dtrace_provider is NULL -- but that's
17200 * not true if we were loaded as a dependency of a DTrace provider.
17201 * Once we've registered, we can assert that dtrace_provider is our
17202 * pseudo provider.
17203 */
17204 (void) dtrace_register("dtrace", &dtrace_provider_attr,
17205 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
17206
17207 ASSERT(dtrace_provider != NULL);
17208 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
17209
17210 #if defined (__x86_64__)
17211 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17212 dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
17213 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17214 dtrace_provider, NULL, NULL, "END", 0, NULL);
17215 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17216 dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
17217 #elif (defined(__arm__) || defined(__arm64__))
17218 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17219 dtrace_provider, NULL, NULL, "BEGIN", 2, NULL);
17220 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17221 dtrace_provider, NULL, NULL, "END", 1, NULL);
17222 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17223 dtrace_provider, NULL, NULL, "ERROR", 4, NULL);
17224 #else
17225 #error Unknown Architecture
17226 #endif
17227
17228 dtrace_anon_property();
17229 lck_mtx_unlock(&cpu_lock);
17230
17231 /*
17232 * If DTrace helper tracing is enabled, we need to allocate the
17233 * trace buffer and initialize the values.
17234 */
17235 if (dtrace_helptrace_enabled) {
17236 ASSERT(dtrace_helptrace_buffer == NULL);
17237 dtrace_helptrace_buffer =
17238 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
17239 dtrace_helptrace_next = 0;
17240 }
17241
17242 /*
17243 * If there are already providers, we must ask them to provide their
17244 * probes, and then match any anonymous enabling against them. Note
17245 * that there should be no other retained enablings at this time:
17246 * the only retained enablings at this time should be the anonymous
17247 * enabling.
17248 */
17249 if (dtrace_anon.dta_enabling != NULL) {
17250 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
17251
17252 /*
17253 * APPLE NOTE: if handling anonymous dof, switch symbol modes.
17254 */
17255 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17256 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17257 }
17258
17259 dtrace_enabling_provide(NULL);
17260 state = dtrace_anon.dta_state;
17261
17262 /*
17263 * We couldn't hold cpu_lock across the above call to
17264 * dtrace_enabling_provide(), but we must hold it to actually
17265 * enable the probes. We have to drop all of our locks, pick
17266 * up cpu_lock, and regain our locks before matching the
17267 * retained anonymous enabling.
17268 */
17269 lck_mtx_unlock(&dtrace_lock);
17270 lck_mtx_unlock(&dtrace_provider_lock);
17271
17272 lck_mtx_lock(&cpu_lock);
17273 lck_mtx_lock(&dtrace_provider_lock);
17274 lck_mtx_lock(&dtrace_lock);
17275
17276 if ((enab = dtrace_anon.dta_enabling) != NULL)
17277 (void) dtrace_enabling_match(enab, NULL, NULL);
17278
17279 lck_mtx_unlock(&cpu_lock);
17280 }
17281
17282 lck_mtx_unlock(&dtrace_lock);
17283 lck_mtx_unlock(&dtrace_provider_lock);
17284
17285 if (state != NULL) {
17286 /*
17287 * If we created any anonymous state, set it going now.
17288 */
17289 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
17290 }
17291
17292 return (DDI_SUCCESS);
17293 }
17294
17295 /*ARGSUSED*/
17296 static int
17297 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
17298 {
17299 #pragma unused(flag, otyp)
17300 dtrace_state_t *state;
17301 uint32_t priv;
17302 uid_t uid;
17303 zoneid_t zoneid;
17304 int rv;
17305
17306 /* APPLE: Darwin puts Helper on its own major device. */
17307
17308 /*
17309 * If no DTRACE_PRIV_* bits are set in the credential, then the
17310 * caller lacks sufficient permission to do anything with DTrace.
17311 */
17312 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
17313 if (priv == DTRACE_PRIV_NONE)
17314 return (EACCES);
17315
17316 /*
17317 * APPLE NOTE: We delay the initialization of fasttrap as late as possible.
17318 * It certainly can't be later than now!
17319 */
17320 fasttrap_init();
17321
17322 /*
17323 * Ask all providers to provide all their probes.
17324 */
17325 lck_mtx_lock(&dtrace_provider_lock);
17326 dtrace_probe_provide(NULL, NULL);
17327 lck_mtx_unlock(&dtrace_provider_lock);
17328
17329 lck_mtx_lock(&cpu_lock);
17330 lck_mtx_lock(&dtrace_lock);
17331 dtrace_opens++;
17332 dtrace_membar_producer();
17333
17334 #ifdef illumos
17335 /*
17336 * If the kernel debugger is active (that is, if the kernel debugger
17337 * modified text in some way), we won't allow the open.
17338 */
17339 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
17340 dtrace_opens--;
17341 lck_mtx_unlock(&dtrace_lock);
17342 lck_mtx_unlock(&cpu_lock);
17343 return (EBUSY);
17344 }
17345 #endif
17346
17347 rv = dtrace_state_create(devp, cred_p, &state);
17348 lck_mtx_unlock(&cpu_lock);
17349
17350 if (rv != 0 || state == NULL) {
17351 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
17352 #ifdef illumos
17353 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17354 #endif
17355 }
17356 lck_mtx_unlock(&dtrace_lock);
17357 /* propagate EAGAIN or ERESTART */
17358 return (rv);
17359 }
17360
17361 lck_mtx_unlock(&dtrace_lock);
17362
17363 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17364
17365 /*
17366 * If we are currently lazy, transition states.
17367 *
17368 * Unlike dtrace_close, we do not need to check the
17369 * value of dtrace_opens, as any positive value (and
17370 * we count as 1) means we transition states.
17371 */
17372 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
17373 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
17374 /*
17375 * We do not need to hold the exclusive lock while processing
17376 * DOF on processes, but we do need to make sure the mode does
17377 * not get changed back to DTRACE_DOF_MODE_LAZY_ON during that
17378 * stage (which should not happen anyway, since that transition
17379 * only occurs in dtrace_close). There is no way incomplete USDT
17380 * probes can be activated by any DTrace client here, since every
17381 * client must call dtrace_open and block on dtrace_dof_mode_lock.
17382 */
17383 lck_rw_lock_exclusive_to_shared(&dtrace_dof_mode_lock);
17384 /*
17385 * Iterate all existing processes and load lazy dofs.
17386 */
17387 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
17388 dtrace_lazy_dofs_proc_iterate_doit,
17389 NULL,
17390 dtrace_lazy_dofs_proc_iterate_filter,
17391 NULL);
17392
17393 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
17394 }
17395 else {
17396 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17397 }
17398
17399
17400 /*
17401 * Update kernel symbol state.
17402 *
17403 * We must own the provider and dtrace locks.
17404 *
17405 * NOTE! It may appear that there is a race in setting this value so
17406 * late, after dtrace_probe_provide. However, any kext loaded after the
17407 * call to probe provide and before we set LAZY_OFF will be marked as
17408 * eligible for symbols from userspace. The same dtrace consumer that is
17409 * currently calling dtrace_open() (this call!) will get a list of kexts
17410 * needing symbols and fill them in, thus closing the race window.
17411 *
17412 * We want to set this value only once it is certain to succeed, as
17413 * this significantly reduces the complexity of error exits.
17414 */
17415 lck_mtx_lock(&dtrace_lock);
17416 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17417 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17418 }
17419 lck_mtx_unlock(&dtrace_lock);
17420
17421 return (0);
17422 }
17423
17424 /*ARGSUSED*/
17425 static int
17426 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
17427 {
17428 #pragma unused(flag, otyp, cred_p) /* __APPLE__ */
17429 minor_t minor = getminor(dev);
17430 dtrace_state_t *state;
17431
17432 /* APPLE NOTE: Darwin puts Helper on its own major device. */
17433 state = dtrace_state_get(minor);
17434
17435 lck_mtx_lock(&cpu_lock);
17436 lck_mtx_lock(&dtrace_lock);
17437
17438 if (state->dts_anon) {
17439 /*
17440 * There is anonymous state. Destroy that first.
17441 */
17442 ASSERT(dtrace_anon.dta_state == NULL);
17443 dtrace_state_destroy(state->dts_anon);
17444 }
17445
17446 dtrace_state_destroy(state);
17447 ASSERT(dtrace_opens > 0);
17448
17449 /*
17450 * Only relinquish control of the kernel debugger interface when there
17451 * are no consumers and no anonymous enablings.
17452 */
17453 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
17454 #ifdef illumos
17455 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17456 #endif
17457 }
17458
17459 lck_mtx_unlock(&dtrace_lock);
17460 lck_mtx_unlock(&cpu_lock);
17461
17462 /*
17463 * Lock ordering requires the dof mode lock be taken before
17464 * the dtrace_lock.
17465 */
17466 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17467 lck_mtx_lock(&dtrace_lock);
17468
17469 if (dtrace_opens == 0) {
17470 /*
17471 * If we are currently lazy-off, and this is the last close, transition to
17472 * lazy state.
17473 */
17474 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
17475 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
17476 }
17477
17478 /*
17479 * If we are the last dtrace client, switch back to lazy (from userspace) symbols
17480 */
17481 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
17482 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
17483 }
17484 }
17485
17486 lck_mtx_unlock(&dtrace_lock);
17487 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17488
17489 /*
17490 * Kext probes may be retained past the end of the kext's lifespan. The
17491 * probes are kept until the last reference to them has been removed.
17492 * Since closing an active dtrace context is likely to drop that last
17493 * reference, let's take a shot at cleaning out the orphaned probes now.
17494 */
17495 dtrace_module_unloaded(NULL);
17496
17497 return (0);
17498 }
17499
17500 /*ARGSUSED*/
17501 static int
17502 dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
17503 {
17504 #pragma unused(rv)
17505 /*
17506 * Safe to check this outside the dof mode lock
17507 */
17508 if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
17509 return KERN_SUCCESS;
17510
17511 switch (cmd) {
17512 #if defined (__arm64__)
17513 case DTRACEHIOC_ADDDOF_U32:
17514 case DTRACEHIOC_ADDDOF_U64:
17515 #else
17516 case DTRACEHIOC_ADDDOF:
17517 #endif /* __arm64__*/
17518 {
17519 dof_helper_t *dhp = NULL;
17520 size_t dof_ioctl_data_size;
17521 dof_ioctl_data_t* multi_dof;
17522 unsigned int i;
17523 int rval = 0;
17524 user_addr_t user_address = *(user_addr_t*)arg;
17525 uint64_t dof_count;
17526 int multi_dof_claimed = 0;
17527 proc_t* p = current_proc();
17528
17529 /*
17530 * If this is a restricted process and dtrace is restricted,
17531 * do not allow DOFs to be registered
17532 */
17533 if (dtrace_is_restricted() &&
17534 !dtrace_are_restrictions_relaxed() &&
17535 !dtrace_can_attach_to_proc(current_proc())) {
17536 return (EACCES);
17537 }
17538
17539 /*
17540 * Read the number of DOF sections being passed in.
17541 */
17542 if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
17543 &dof_count,
17544 sizeof(dof_count))) {
17545 dtrace_dof_error(NULL, "failed to copyin dofiod_count");
17546 return (EFAULT);
17547 }
17548
17549 /*
17550 * Range check the count.
17551 */
17552 if (dof_count == 0 || dof_count > 1024) {
17553 dtrace_dof_error(NULL, "dofiod_count is not valid");
17554 return (EINVAL);
17555 }
17556
17557 /*
17558 * Allocate a correctly sized structure and copyin the data.
17559 */
17560 dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
17561 if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
17562 return (ENOMEM);
17563
17564 /* NOTE! We can no longer exit this method via return */
17565 if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
17566 dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
17567 rval = EFAULT;
17568 goto cleanup;
17569 }
17570
17571 /*
17572 * Check that the count didn't change between the first copyin and the second.
17573 */
17574 if (multi_dof->dofiod_count != dof_count) {
17575 rval = EINVAL;
17576 goto cleanup;
17577 }
17578
17579 /*
17580 * Try to process lazily first.
17581 */
17582 rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
17583
17584 /*
17585 * If rval is EACCES, we must be non-lazy.
17586 */
17587 if (rval == EACCES) {
17588 rval = 0;
17589 /*
17590 * Process each dof_helper_t
17591 */
17592 i = 0;
17593 do {
17594 dhp = &multi_dof->dofiod_helpers[i];
17595
17596 dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
17597
17598 if (dof != NULL) {
17599 lck_mtx_lock(&dtrace_meta_lock);
17600 lck_mtx_lock(&dtrace_lock);
17601
17602 /*
17603 * dtrace_helper_slurp() takes responsibility for the dof --
17604 * it may free it now or it may save it and free it later.
17605 */
17606 if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
17607 rval = EINVAL;
17608 }
17609
17610 lck_mtx_unlock(&dtrace_lock);
17611 lck_mtx_unlock(&dtrace_meta_lock);
17612 }
17613 } while (++i < multi_dof->dofiod_count && rval == 0);
17614 }
17615
17616 /*
17617 * We need to copyout the multi_dof struct, because it contains
17618 * the generation (unique id) values needed to call DTRACEHIOC_REMOVE
17619 *
17620 * This could certainly be better optimized.
17621 */
17622 if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
17623 dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
17624 /* Don't overwrite pre-existing error code */
17625 if (rval == 0) rval = EFAULT;
17626 }
17627
17628 cleanup:
17629 /*
17630 * If we had to allocate struct memory, free it.
17631 */
17632 if (multi_dof != NULL && !multi_dof_claimed) {
17633 kmem_free(multi_dof, dof_ioctl_data_size);
17634 }
17635
17636 return rval;
17637 }
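/*
 * Illustrative sketch (not part of the original source): the two copyins
 * above guard against a caller racing to change the count between the size
 * computation and the full copy. The general shape of the defense, with
 * hypothetical names, is:
 */
#if 0
	uint64_t count;

	/* 1. Copy in only the count field and range check it. */
	if (copyin(uaddr + offsetof(example_t, count), &count, sizeof (count)))
		return (EFAULT);
	if (count == 0 || count > EXAMPLE_MAX)
		return (EINVAL);

	/* 2. Allocate from the validated count and copy in everything. */
	buf = kmem_alloc(EXAMPLE_SIZE(count), KM_SLEEP);
	if (copyin(uaddr, buf, EXAMPLE_SIZE(count)) != 0) {
		rval = EFAULT;
		goto cleanup;
	}

	/* 3. Re-check: the count must not have changed underneath us. */
	if (buf->count != count) {
		rval = EINVAL;
		goto cleanup;
	}
#endif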
17638
17639 case DTRACEHIOC_REMOVE: {
17640 int generation = *(int*)arg;
17641 proc_t* p = current_proc();
17642
17643 /*
17644 * Try lazy first.
17645 */
17646 int rval = dtrace_lazy_dofs_remove(p, generation);
17647
17648 /*
17649 * EACCES means non-lazy
17650 */
17651 if (rval == EACCES) {
17652 lck_mtx_lock(&dtrace_meta_lock);
17653 lck_mtx_lock(&dtrace_lock);
17654 rval = dtrace_helper_destroygen(p, generation);
17655 lck_mtx_unlock(&dtrace_lock);
17656 lck_mtx_unlock(&dtrace_meta_lock);
17657 }
17658
17659 return (rval);
17660 }
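/*
 * Illustrative sketch (not part of the original source): the generation id
 * that DTRACEHIOC_ADDDOF copies back in dofhp_dof above is the handle a
 * process later hands to DTRACEHIOC_REMOVE, assuming the ioctl is declared
 * to copy in an int, as the *(int *)arg read above implies:
 */
#if 0
	int gen = saved_generation;	/* hypothetical: kept from the ADDDOF copyout */

	if (ioctl(helper_fd, DTRACEHIOC_REMOVE, &gen) == -1)
		perror("DTRACEHIOC_REMOVE");
#endif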
17661
17662 default:
17663 break;
17664 }
17665
17666 return ENOTTY;
17667 }
17668
17669 /*ARGSUSED*/
17670 static int
17671 dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
17672 {
17673 #pragma unused(md)
17674 minor_t minor = getminor(dev);
17675 dtrace_state_t *state;
17676 int rval;
17677
17678 /* Darwin puts Helper on its own major device. */
17679
17680 state = dtrace_state_get(minor);
17681
17682 if (state->dts_anon) {
17683 ASSERT(dtrace_anon.dta_state == NULL);
17684 state = state->dts_anon;
17685 }
17686
17687 switch (cmd) {
17688 case DTRACEIOC_PROVIDER: {
17689 dtrace_providerdesc_t pvd;
17690 dtrace_provider_t *pvp;
17691
17692 if (copyin(arg, &pvd, sizeof (pvd)) != 0)
17693 return (EFAULT);
17694
17695 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17696 lck_mtx_lock(&dtrace_provider_lock);
17697
17698 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17699 if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
17700 break;
17701 }
17702
17703 lck_mtx_unlock(&dtrace_provider_lock);
17704
17705 if (pvp == NULL)
17706 return (ESRCH);
17707
17708 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17709 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17710 if (copyout(&pvd, arg, sizeof (pvd)) != 0)
17711 return (EFAULT);
17712
17713 return (0);
17714 }
17715
17716 case DTRACEIOC_EPROBE: {
17717 dtrace_eprobedesc_t epdesc;
17718 dtrace_ecb_t *ecb;
17719 dtrace_action_t *act;
17720 void *buf;
17721 size_t size;
17722 uintptr_t dest;
17723 int nrecs;
17724
17725 if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
17726 return (EFAULT);
17727
17728 lck_mtx_lock(&dtrace_lock);
17729
17730 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17731 lck_mtx_unlock(&dtrace_lock);
17732 return (EINVAL);
17733 }
17734
17735 if (ecb->dte_probe == NULL) {
17736 lck_mtx_unlock(&dtrace_lock);
17737 return (EINVAL);
17738 }
17739
17740 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17741 epdesc.dtepd_uarg = ecb->dte_uarg;
17742 epdesc.dtepd_size = ecb->dte_size;
17743
17744 nrecs = epdesc.dtepd_nrecs;
17745 epdesc.dtepd_nrecs = 0;
17746 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17747 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17748 continue;
17749
17750 epdesc.dtepd_nrecs++;
17751 }
17752
17753 /*
17754 * Now that we have the size, we need to allocate a temporary
17755 * buffer in which to store the complete description. We need
17756 * the temporary buffer to be able to drop dtrace_lock()
17757 * across the copyout(), below.
17758 */
17759 size = sizeof (dtrace_eprobedesc_t) +
17760 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17761
17762 buf = kmem_alloc(size, KM_SLEEP);
17763 dest = (uintptr_t)buf;
17764
17765 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17766 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17767
17768 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17769 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17770 continue;
17771
17772 if (nrecs-- == 0)
17773 break;
17774
17775 bcopy(&act->dta_rec, (void *)dest,
17776 sizeof (dtrace_recdesc_t));
17777 dest += sizeof (dtrace_recdesc_t);
17778 }
17779
17780 lck_mtx_unlock(&dtrace_lock);
17781
17782 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17783 kmem_free(buf, size);
17784 return (EFAULT);
17785 }
17786
17787 kmem_free(buf, size);
17788 return (0);
17789 }
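/*
 * Illustrative sketch (not part of the original source): copyout() can fault
 * and block, so it must not run under dtrace_lock. This case and
 * DTRACEIOC_AGGDESC below both follow the same snapshot-then-copy shape
 * (helper names here are hypothetical):
 */
#if 0
	lck_mtx_lock(&dtrace_lock);
	size = example_compute_size();		/* sized while locked */
	buf = kmem_alloc(size, KM_SLEEP);
	example_fill_snapshot(buf);		/* filled while locked */
	lck_mtx_unlock(&dtrace_lock);

	/* No locks held: safe to touch pageable user memory. */
	if (copyout(buf, arg, size) != 0) {
		kmem_free(buf, size);
		return (EFAULT);
	}
	kmem_free(buf, size);
#endif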
17790
17791 case DTRACEIOC_AGGDESC: {
17792 dtrace_aggdesc_t aggdesc;
17793 dtrace_action_t *act;
17794 dtrace_aggregation_t *agg;
17795 int nrecs;
17796 uint32_t offs;
17797 dtrace_recdesc_t *lrec;
17798 void *buf;
17799 size_t size;
17800 uintptr_t dest;
17801
17802 if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
17803 return (EFAULT);
17804
17805 lck_mtx_lock(&dtrace_lock);
17806
17807 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17808 lck_mtx_unlock(&dtrace_lock);
17809 return (EINVAL);
17810 }
17811
17812 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17813
17814 nrecs = aggdesc.dtagd_nrecs;
17815 aggdesc.dtagd_nrecs = 0;
17816
17817 offs = agg->dtag_base;
17818 lrec = &agg->dtag_action.dta_rec;
17819 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17820
17821 for (act = agg->dtag_first; ; act = act->dta_next) {
17822 ASSERT(act->dta_intuple ||
17823 DTRACEACT_ISAGG(act->dta_kind));
17824
17825 /*
17826 * If this action has a record size of zero, it
17827 * denotes an argument to the aggregating action.
17828 * Because the presence of this record doesn't (or
17829 * shouldn't) affect the way the data is interpreted,
17830 * we don't copy it out to save user-level the
17831 * confusion of dealing with a zero-length record.
17832 */
17833 if (act->dta_rec.dtrd_size == 0) {
17834 ASSERT(agg->dtag_hasarg);
17835 continue;
17836 }
17837
17838 aggdesc.dtagd_nrecs++;
17839
17840 if (act == &agg->dtag_action)
17841 break;
17842 }
17843
17844 /*
17845 * Now that we have the size, we need to allocate a temporary
17846 * buffer in which to store the complete description. We need
17847 * the temporary buffer to be able to drop dtrace_lock()
17848 * across the copyout(), below.
17849 */
17850 size = sizeof (dtrace_aggdesc_t) +
17851 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17852
17853 buf = kmem_alloc(size, KM_SLEEP);
17854 dest = (uintptr_t)buf;
17855
17856 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17857 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17858
17859 for (act = agg->dtag_first; ; act = act->dta_next) {
17860 dtrace_recdesc_t rec = act->dta_rec;
17861
17862 /*
17863 * See the comment in the above loop for why we pass
17864 * over zero-length records.
17865 */
17866 if (rec.dtrd_size == 0) {
17867 ASSERT(agg->dtag_hasarg);
17868 continue;
17869 }
17870
17871 if (nrecs-- == 0)
17872 break;
17873
17874 rec.dtrd_offset -= offs;
17875 bcopy(&rec, (void *)dest, sizeof (rec));
17876 dest += sizeof (dtrace_recdesc_t);
17877
17878 if (act == &agg->dtag_action)
17879 break;
17880 }
17881
17882 lck_mtx_unlock(&dtrace_lock);
17883
17884 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17885 kmem_free(buf, size);
17886 return (EFAULT);
17887 }
17888
17889 kmem_free(buf, size);
17890 return (0);
17891 }
17892
17893 case DTRACEIOC_ENABLE: {
17894 dof_hdr_t *dof;
17895 dtrace_enabling_t *enab = NULL;
17896 dtrace_vstate_t *vstate;
17897 int err = 0;
17898
17899 *rv = 0;
17900
17901 /*
17902 * If a NULL argument has been passed, we take this as our
17903 * cue to reevaluate our enablings.
17904 */
17905 if (arg == 0) {
17906 dtrace_enabling_matchall();
17907
17908 return (0);
17909 }
17910
17911 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17912 return (rval);
17913
17914 lck_mtx_lock(&cpu_lock);
17915 lck_mtx_lock(&dtrace_lock);
17916 vstate = &state->dts_vstate;
17917
17918 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17919 lck_mtx_unlock(&dtrace_lock);
17920 lck_mtx_unlock(&cpu_lock);
17921 dtrace_dof_destroy(dof);
17922 return (EBUSY);
17923 }
17924
17925 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17926 lck_mtx_unlock(&dtrace_lock);
17927 lck_mtx_unlock(&cpu_lock);
17928 dtrace_dof_destroy(dof);
17929 return (EINVAL);
17930 }
17931
17932 if ((rval = dtrace_dof_options(dof, state)) != 0) {
17933 dtrace_enabling_destroy(enab);
17934 lck_mtx_unlock(&dtrace_lock);
17935 lck_mtx_unlock(&cpu_lock);
17936 dtrace_dof_destroy(dof);
17937 return (rval);
17938 }
17939
17940 if ((err = dtrace_enabling_match(enab, rv, NULL)) == 0) {
17941 err = dtrace_enabling_retain(enab);
17942 } else {
17943 dtrace_enabling_destroy(enab);
17944 }
17945
17946 lck_mtx_unlock(&dtrace_lock);
17947 lck_mtx_unlock(&cpu_lock);
17948 dtrace_dof_destroy(dof);
17949
17950 return (err);
17951 }
17952
17953 case DTRACEIOC_REPLICATE: {
17954 dtrace_repldesc_t desc;
17955 dtrace_probedesc_t *match = &desc.dtrpd_match;
17956 dtrace_probedesc_t *create = &desc.dtrpd_create;
17957 int err;
17958
17959 if (copyin(arg, &desc, sizeof (desc)) != 0)
17960 return (EFAULT);
17961
17962 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17963 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17964 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17965 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17966
17967 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17968 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17969 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17970 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17971
17972 lck_mtx_lock(&dtrace_lock);
17973 err = dtrace_enabling_replicate(state, match, create);
17974 lck_mtx_unlock(&dtrace_lock);
17975
17976 return (err);
17977 }
17978
17979 case DTRACEIOC_PROBEMATCH:
17980 case DTRACEIOC_PROBES: {
17981 dtrace_probe_t *probe = NULL;
17982 dtrace_probedesc_t desc;
17983 dtrace_probekey_t pkey;
17984 dtrace_id_t i;
17985 int m = 0;
17986 uint32_t priv;
17987 uid_t uid;
17988 zoneid_t zoneid;
17989
17990 if (copyin(arg, &desc, sizeof (desc)) != 0)
17991 return (EFAULT);
17992
17993 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17994 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17995 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17996 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17997
17998 /*
17999 * Before we attempt to match this probe, we want to give
18000 * all providers the opportunity to provide it.
18001 */
18002 if (desc.dtpd_id == DTRACE_IDNONE) {
18003 lck_mtx_lock(&dtrace_provider_lock);
18004 dtrace_probe_provide(&desc, NULL);
18005 lck_mtx_unlock(&dtrace_provider_lock);
18006 desc.dtpd_id++;
18007 }
18008
18009 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
18010
18011 lck_mtx_lock(&dtrace_lock);
18012
18013 if (cmd == DTRACEIOC_PROBEMATCH) {
18014 dtrace_probekey(&desc, &pkey);
18015 pkey.dtpk_id = DTRACE_IDNONE;
18016
18017 /* Quiet compiler warning */
18018 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18019 if ((probe = dtrace_probes[i - 1]) != NULL &&
18020 (m = dtrace_match_probe(probe, &pkey,
18021 priv, uid, zoneid)) != 0)
18022 break;
18023 }
18024
18025 if (m < 0) {
18026 lck_mtx_unlock(&dtrace_lock);
18027 return (EINVAL);
18028 }
18029 dtrace_probekey_release(&pkey);
18030
18031 } else {
18032 /* Quiet compiler warning */
18033 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18034 if ((probe = dtrace_probes[i - 1]) != NULL &&
18035 dtrace_match_priv(probe, priv, uid, zoneid))
18036 break;
18037 }
18038 }
18039
18040 if (probe == NULL) {
18041 lck_mtx_unlock(&dtrace_lock);
18042 return (ESRCH);
18043 }
18044
18045 dtrace_probe_description(probe, &desc);
18046 lck_mtx_unlock(&dtrace_lock);
18047
18048 if (copyout(&desc, arg, sizeof (desc)) != 0)
18049 return (EFAULT);
18050
18051 return (0);
18052 }
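/*
 * Illustrative sketch (not part of the original source): a consumer can walk
 * every probe visible to it by starting at DTRACE_IDNONE and resuming the
 * scan one past each returned dtpd_id until the ioctl fails with ESRCH.
 * This assumes an open /dev/dtrace descriptor and the pointer-to-pointer
 * argument convention implemented by _dtrace_ioctl() below:
 */
#if 0
	dtrace_probedesc_t desc;
	void *udesc = &desc;

	bzero(&desc, sizeof (desc));
	desc.dtpd_id = DTRACE_IDNONE;

	while (ioctl(fd, DTRACEIOC_PROBES, &udesc) == 0) {
		printf("%u %s:%s:%s:%s\n", desc.dtpd_id, desc.dtpd_provider,
		    desc.dtpd_mod, desc.dtpd_func, desc.dtpd_name);
		desc.dtpd_id++;		/* resume after this probe */
	}
#endif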
18053
18054 case DTRACEIOC_PROBEARG: {
18055 dtrace_argdesc_t desc;
18056 dtrace_probe_t *probe;
18057 dtrace_provider_t *prov;
18058
18059 if (copyin(arg, &desc, sizeof (desc)) != 0)
18060 return (EFAULT);
18061
18062 if (desc.dtargd_id == DTRACE_IDNONE)
18063 return (EINVAL);
18064
18065 if (desc.dtargd_ndx == DTRACE_ARGNONE)
18066 return (EINVAL);
18067
18068 lck_mtx_lock(&dtrace_provider_lock);
18069 lck_mtx_lock(&mod_lock);
18070 lck_mtx_lock(&dtrace_lock);
18071
18072 /* Quiet compiler warning */
18073 if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
18074 lck_mtx_unlock(&dtrace_lock);
18075 lck_mtx_unlock(&mod_lock);
18076 lck_mtx_unlock(&dtrace_provider_lock);
18077 return (EINVAL);
18078 }
18079
18080 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
18081 lck_mtx_unlock(&dtrace_lock);
18082 lck_mtx_unlock(&mod_lock);
18083 lck_mtx_unlock(&dtrace_provider_lock);
18084 return (EINVAL);
18085 }
18086
18087 lck_mtx_unlock(&dtrace_lock);
18088
18089 prov = probe->dtpr_provider;
18090
18091 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
18092 /*
18093 * There isn't any typed information for this probe.
18094 * Set the argument number to DTRACE_ARGNONE.
18095 */
18096 desc.dtargd_ndx = DTRACE_ARGNONE;
18097 } else {
18098 desc.dtargd_native[0] = '\0';
18099 desc.dtargd_xlate[0] = '\0';
18100 desc.dtargd_mapping = desc.dtargd_ndx;
18101
18102 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
18103 probe->dtpr_id, probe->dtpr_arg, &desc);
18104 }
18105
18106 lck_mtx_unlock(&mod_lock);
18107 lck_mtx_unlock(&dtrace_provider_lock);
18108
18109 if (copyout(&desc, arg, sizeof (desc)) != 0)
18110 return (EFAULT);
18111
18112 return (0);
18113 }
18114
18115 case DTRACEIOC_GO: {
18116 processorid_t cpuid;
18117 rval = dtrace_state_go(state, &cpuid);
18118
18119 if (rval != 0)
18120 return (rval);
18121
18122 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18123 return (EFAULT);
18124
18125 return (0);
18126 }
18127
18128 case DTRACEIOC_STOP: {
18129 processorid_t cpuid;
18130
18131 lck_mtx_lock(&dtrace_lock);
18132 rval = dtrace_state_stop(state, &cpuid);
18133 lck_mtx_unlock(&dtrace_lock);
18134
18135 if (rval != 0)
18136 return (rval);
18137
18138 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18139 return (EFAULT);
18140
18141 return (0);
18142 }
18143
18144 case DTRACEIOC_DOFGET: {
18145 dof_hdr_t hdr, *dof;
18146 uint64_t len;
18147
18148 if (copyin(arg, &hdr, sizeof (hdr)) != 0)
18149 return (EFAULT);
18150
18151 lck_mtx_lock(&dtrace_lock);
18152 dof = dtrace_dof_create(state);
18153 lck_mtx_unlock(&dtrace_lock);
18154
18155 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
18156 rval = copyout(dof, arg, len);
18157 dtrace_dof_destroy(dof);
18158
18159 return (rval == 0 ? 0 : EFAULT);
18160 }
18161
18162 case DTRACEIOC_SLEEP: {
18163 int64_t time;
18164 uint64_t abstime;
18165 uint64_t rvalue = DTRACE_WAKE_TIMEOUT;
18166
18167 if (copyin(arg, &time, sizeof(time)) != 0)
18168 return (EFAULT);
18169
18170 nanoseconds_to_absolutetime((uint64_t)time, &abstime);
18171 clock_absolutetime_interval_to_deadline(abstime, &abstime);
18172
18173 if (assert_wait_deadline(state, THREAD_ABORTSAFE, abstime) == THREAD_WAITING) {
18174 if (state->dts_buf_over_limit > 0) {
18175 clear_wait(current_thread(), THREAD_INTERRUPTED);
18176 rvalue = DTRACE_WAKE_BUF_LIMIT;
18177 } else {
18178 thread_block(THREAD_CONTINUE_NULL);
18179 if (state->dts_buf_over_limit > 0) {
18180 rvalue = DTRACE_WAKE_BUF_LIMIT;
18181 }
18182 }
18183 }
18184
18185 if (copyout(&rvalue, arg, sizeof(rvalue)) != 0)
18186 return (EFAULT);
18187
18188 return (0);
18189 }
18190
18191 case DTRACEIOC_SIGNAL: {
18192 wakeup(state);
18193 return (0);
18194 }
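/*
 * Illustrative sketch (not part of the original source): a consumer parks in
 * DTRACEIOC_SLEEP between buffer switches; the same user buffer carries the
 * timeout in and the wake reason out, and DTRACEIOC_SIGNAL from another
 * thread cuts the sleep short:
 */
#if 0
	uint64_t buf = 1000000000ULL;	/* sleep up to one second, in nanoseconds */
	void *ubuf = &buf;

	if (ioctl(fd, DTRACEIOC_SLEEP, &ubuf) == 0 &&
	    buf == DTRACE_WAKE_BUF_LIMIT) {
		/* a principal buffer crossed its limit: drain it early */
	}
#endif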
18195
18196 case DTRACEIOC_AGGSNAP:
18197 case DTRACEIOC_BUFSNAP: {
18198 dtrace_bufdesc_t desc;
18199 caddr_t cached;
18200 boolean_t over_limit;
18201 dtrace_buffer_t *buf;
18202
18203 if (copyin(arg, &desc, sizeof (desc)) != 0)
18204 return (EFAULT);
18205
18206 if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
18207 return (EINVAL);
18208
18209 lck_mtx_lock(&dtrace_lock);
18210
18211 if (cmd == DTRACEIOC_BUFSNAP) {
18212 buf = &state->dts_buffer[desc.dtbd_cpu];
18213 } else {
18214 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
18215 }
18216
18217 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
18218 size_t sz = buf->dtb_offset;
18219
18220 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
18221 lck_mtx_unlock(&dtrace_lock);
18222 return (EBUSY);
18223 }
18224
18225 /*
18226 * If this buffer has already been consumed, we're
18227 * going to indicate that there's nothing left here
18228 * to consume.
18229 */
18230 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
18231 lck_mtx_unlock(&dtrace_lock);
18232
18233 desc.dtbd_size = 0;
18234 desc.dtbd_drops = 0;
18235 desc.dtbd_errors = 0;
18236 desc.dtbd_oldest = 0;
18237 sz = sizeof (desc);
18238
18239 if (copyout(&desc, arg, sz) != 0)
18240 return (EFAULT);
18241
18242 return (0);
18243 }
18244
18245 /*
18246 * If this is a ring buffer that has wrapped, we want
18247 * to copy the whole thing out.
18248 */
18249 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
18250 dtrace_buffer_polish(buf);
18251 sz = buf->dtb_size;
18252 }
18253
18254 if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
18255 lck_mtx_unlock(&dtrace_lock);
18256 return (EFAULT);
18257 }
18258
18259 desc.dtbd_size = sz;
18260 desc.dtbd_drops = buf->dtb_drops;
18261 desc.dtbd_errors = buf->dtb_errors;
18262 desc.dtbd_oldest = buf->dtb_xamot_offset;
18263 desc.dtbd_timestamp = dtrace_gethrtime();
18264
18265 lck_mtx_unlock(&dtrace_lock);
18266
18267 if (copyout(&desc, arg, sizeof (desc)) != 0)
18268 return (EFAULT);
18269
18270 buf->dtb_flags |= DTRACEBUF_CONSUMED;
18271
18272 return (0);
18273 }
18274
18275 if (buf->dtb_tomax == NULL) {
18276 ASSERT(buf->dtb_xamot == NULL);
18277 lck_mtx_unlock(&dtrace_lock);
18278 return (ENOENT);
18279 }
18280
18281 cached = buf->dtb_tomax;
18282 over_limit = buf->dtb_cur_limit == buf->dtb_size;
18283
18284 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
18285
18286 dtrace_xcall(desc.dtbd_cpu,
18287 (dtrace_xcall_t)dtrace_buffer_switch, buf);
18288
18289 state->dts_errors += buf->dtb_xamot_errors;
18290
18291 /*
18292 * If the buffers did not actually switch, then the cross call
18293 * did not take place -- presumably because the given CPU is
18294 * not in the ready set. If this is the case, we'll return
18295 * ENOENT.
18296 */
18297 if (buf->dtb_tomax == cached) {
18298 ASSERT(buf->dtb_xamot != cached);
18299 lck_mtx_unlock(&dtrace_lock);
18300 return (ENOENT);
18301 }
18302
18303 ASSERT(cached == buf->dtb_xamot);
18304 /*
18305 * At this point we know the buffers have switched, so we
18306 * can decrement the over-limit count if the old buffer was over
18307 * its limit. The new buffer might already be over its limit
18308 * too, but we don't care, since we're guaranteed not to be
18309 * checking the buffer over-limit count at this point.
18310 */
18311 if (over_limit) {
18312 uint32_t old = os_atomic_dec_orig(&state->dts_buf_over_limit, relaxed);
18313 #pragma unused(old)
18314
18315 /*
18316 * Verify that we didn't underflow the value
18317 */
18318 ASSERT(old != 0);
18319 }
18320
18321 /*
18322 * We have our snapshot; now copy it out.
18323 */
18324 if (dtrace_buffer_copyout(buf->dtb_xamot,
18325 (user_addr_t)desc.dtbd_data,
18326 buf->dtb_xamot_offset) != 0) {
18327 lck_mtx_unlock(&dtrace_lock);
18328 return (EFAULT);
18329 }
18330
18331 desc.dtbd_size = buf->dtb_xamot_offset;
18332 desc.dtbd_drops = buf->dtb_xamot_drops;
18333 desc.dtbd_errors = buf->dtb_xamot_errors;
18334 desc.dtbd_oldest = 0;
18335 desc.dtbd_timestamp = buf->dtb_switched;
18336
18337 lck_mtx_unlock(&dtrace_lock);
18338
18339 /*
18340 * Finally, copy out the buffer description.
18341 */
18342 if (copyout(&desc, arg, sizeof (desc)) != 0)
18343 return (EFAULT);
18344
18345 return (0);
18346 }
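/*
 * Illustrative sketch (not part of the original source): each per-CPU
 * principal buffer is really a pair -- dtb_tomax (active) and dtb_xamot
 * (inactive) -- and dtrace_buffer_switch() swaps them on the target CPU
 * itself via dtrace_xcall(), so no probe can be mid-record during the swap:
 */
#if 0
	caddr_t before = buf->dtb_tomax;

	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);

	if (buf->dtb_tomax == before) {
		/* The CPU was not in the ready set; the cross call never ran. */
		return (ENOENT);
	}
	/* dtb_xamot now holds a quiesced snapshot, safe to copy out. */
#endif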
18347
18348 case DTRACEIOC_CONF: {
18349 dtrace_conf_t conf;
18350
18351 bzero(&conf, sizeof (conf));
18352 conf.dtc_difversion = DIF_VERSION;
18353 conf.dtc_difintregs = DIF_DIR_NREGS;
18354 conf.dtc_diftupregs = DIF_DTR_NREGS;
18355 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
18356
18357 if (copyout(&conf, arg, sizeof (conf)) != 0)
18358 return (EFAULT);
18359
18360 return (0);
18361 }
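/*
 * Illustrative sketch (not part of the original source): DTRACEIOC_CONF is a
 * pure copyout, which makes it a convenient smoke test for a freshly opened
 * /dev/dtrace descriptor (again assuming the pointer-to-pointer convention
 * of _dtrace_ioctl() below):
 */
#if 0
	dtrace_conf_t conf;
	void *uconf = &conf;

	if (ioctl(fd, DTRACEIOC_CONF, &uconf) == 0) {
		printf("DIF version %u, %u int regs, %u tuple regs\n",
		    conf.dtc_difversion, conf.dtc_difintregs,
		    conf.dtc_diftupregs);
	}
#endif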
18362
18363 case DTRACEIOC_STATUS: {
18364 dtrace_status_t stat;
18365 dtrace_dstate_t *dstate;
18366 int i, j;
18367 uint64_t nerrs;
18368
18369 /*
18370 * See the comment in dtrace_state_deadman() for the reason
18371 * for setting dts_laststatus to INT64_MAX before setting
18372 * it to the correct value.
18373 */
18374 state->dts_laststatus = INT64_MAX;
18375 dtrace_membar_producer();
18376 state->dts_laststatus = dtrace_gethrtime();
18377
18378 bzero(&stat, sizeof (stat));
18379
18380 lck_mtx_lock(&dtrace_lock);
18381
18382 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
18383 lck_mtx_unlock(&dtrace_lock);
18384 return (ENOENT);
18385 }
18386
18387 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
18388 stat.dtst_exiting = 1;
18389
18390 nerrs = state->dts_errors;
18391 dstate = &state->dts_vstate.dtvs_dynvars;
18392
18393 for (i = 0; i < (int)NCPU; i++) {
18394 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
18395
18396 stat.dtst_dyndrops += dcpu->dtdsc_drops;
18397 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
18398 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
18399
18400 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
18401 stat.dtst_filled++;
18402
18403 nerrs += state->dts_buffer[i].dtb_errors;
18404
18405 for (j = 0; j < state->dts_nspeculations; j++) {
18406 dtrace_speculation_t *spec;
18407 dtrace_buffer_t *buf;
18408
18409 spec = &state->dts_speculations[j];
18410 buf = &spec->dtsp_buffer[i];
18411 stat.dtst_specdrops += buf->dtb_xamot_drops;
18412 }
18413 }
18414
18415 stat.dtst_specdrops_busy = state->dts_speculations_busy;
18416 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
18417 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
18418 stat.dtst_dblerrors = state->dts_dblerrors;
18419 stat.dtst_killed =
18420 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
18421 stat.dtst_errors = nerrs;
18422
18423 lck_mtx_unlock(&dtrace_lock);
18424
18425 if (copyout(&stat, arg, sizeof (stat)) != 0)
18426 return (EFAULT);
18427
18428 return (0);
18429 }
18430
18431 case DTRACEIOC_FORMAT: {
18432 dtrace_fmtdesc_t fmt;
18433 char *str;
18434 int len;
18435
18436 if (copyin(arg, &fmt, sizeof (fmt)) != 0)
18437 return (EFAULT);
18438
18439 lck_mtx_lock(&dtrace_lock);
18440
18441 if (fmt.dtfd_format == 0 ||
18442 fmt.dtfd_format > state->dts_nformats) {
18443 lck_mtx_unlock(&dtrace_lock);
18444 return (EINVAL);
18445 }
18446
18447 /*
18448 * Format strings are allocated contiguously and they are
18449 * never freed; if a format index is less than the number
18450 * of formats, we can assert that the format map is non-NULL
18451 * and that the format for the specified index is non-NULL.
18452 */
18453 ASSERT(state->dts_formats != NULL);
18454 str = state->dts_formats[fmt.dtfd_format - 1]->dtf_str;
18455 ASSERT(str != NULL);
18456
18457 len = strlen(str) + 1;
18458
18459 if (len > fmt.dtfd_length) {
18460 fmt.dtfd_length = len;
18461
18462 if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
18463 lck_mtx_unlock(&dtrace_lock);
18464 return (EINVAL);
18465 }
18466 } else {
18467 if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
18468 lck_mtx_unlock(&dtrace_lock);
18469 return (EINVAL);
18470 }
18471 }
18472
18473 lck_mtx_unlock(&dtrace_lock);
18474 return (0);
18475 }
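/*
 * Illustrative sketch (not part of the original source): the length check
 * above is a two-phase negotiation. If the caller's buffer is too small,
 * only dtfd_length is updated (the ioctl still returns 0), so the consumer
 * compares lengths and retries with a larger buffer (types simplified):
 */
#if 0
	char small[64];
	dtrace_fmtdesc_t fmt;
	void *ufmt = &fmt;

	bzero(&fmt, sizeof (fmt));
	fmt.dtfd_format = 1;			/* first registered format */
	fmt.dtfd_string = small;
	fmt.dtfd_length = sizeof (small);

	if (ioctl(fd, DTRACEIOC_FORMAT, &ufmt) == 0 &&
	    fmt.dtfd_length > (int)sizeof (small)) {
		/* Too small: dtfd_length now holds the required size. */
		fmt.dtfd_string = malloc(fmt.dtfd_length);
		(void) ioctl(fd, DTRACEIOC_FORMAT, &ufmt);
	}
#endif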
18476
18477 case DTRACEIOC_MODUUIDSLIST: {
18478 size_t module_uuids_list_size;
18479 dtrace_module_uuids_list_t* uuids_list;
18480 uint64_t dtmul_count;
18481
18482 /*
18483 * Security restrictions make this operation illegal: when they are
18484 * in effect, DTrace must refuse to provide any fbt probes.
18485 */
18486 if (dtrace_fbt_probes_restricted()) {
18487 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
18488 return (EPERM);
18489 }
18490
18491 /*
18492 * Fail if the kernel symbol mode makes this operation illegal.
18493 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states; it is legal to
18494 * check for them without holding the dtrace_lock.
18495 */
18496 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18497 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18498 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
18499 return (EPERM);
18500 }
18501
18502 /*
18503 * Read the number of module UUID entries being passed in.
18504 */
18505 if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
18506 &dtmul_count,
18507 sizeof(dtmul_count))) {
18508 cmn_err(CE_WARN, "failed to copyin dtmul_count");
18509 return (EFAULT);
18510 }
18511
18512 /*
18513 * Range check the count. More than 2k kexts is probably an error.
18514 */
18515 if (dtmul_count > 2048) {
18516 cmn_err(CE_WARN, "dtmul_count is not valid");
18517 return (EINVAL);
18518 }
18519
18520 /*
18521 * For all queries, we return EINVAL when the user specified
18522 * count does not match the actual number of modules we find
18523 * available.
18524 *
18525 * If the user specified count is zero, then this serves as a
18526 * simple query to count the available modules in need of symbols.
18527 */
18528
18529 rval = 0;
18530
18531 if (dtmul_count == 0)
18532 {
18533 lck_mtx_lock(&mod_lock);
18534 struct modctl* ctl = dtrace_modctl_list;
18535 while (ctl) {
18536 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18537 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18538 dtmul_count++;
18539 rval = EINVAL;
18540 }
18541 ctl = ctl->mod_next;
18542 }
18543 lck_mtx_unlock(&mod_lock);
18544
18545 if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
18546 return (EFAULT);
18547 else
18548 return (rval);
18549 }
18550
18551 /*
18552 * If we reach this point, then we have a request for full list data.
18553 * Allocate a correctly sized structure and copyin the data.
18554 */
18555 module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
18556 if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
18557 return (ENOMEM);
18558
18559 /* NOTE! We can no longer exit this method via return */
18560 if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
18561 cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
18562 rval = EFAULT;
18563 goto moduuidslist_cleanup;
18564 }
18565
18566 /*
18567 * Check that the count didn't change between the first copyin and the second.
18568 */
18569 if (uuids_list->dtmul_count != dtmul_count) {
18570 rval = EINVAL;
18571 goto moduuidslist_cleanup;
18572 }
18573
18574 /*
18575 * Build the list of UUIDs that need symbols.
18576 */
18577 lck_mtx_lock(&mod_lock);
18578
18579 dtmul_count = 0;
18580
18581 struct modctl* ctl = dtrace_modctl_list;
18582 while (ctl) {
18583 /*
18584 * We assume that userspace symbols will be "better" than kernel-level symbols,
18585 * as userspace can search for dSYMs and symbolicated binaries. Even if kernel
18586 * syms are available, add user syms if the module might use them.
18587 */
18588 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18589 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18590 UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
18591 if (dtmul_count++ < uuids_list->dtmul_count) {
18592 memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
18593 }
18594 }
18595 ctl = ctl->mod_next;
18596 }
18597
18598 lck_mtx_unlock(&mod_lock);
18599
18600 if (uuids_list->dtmul_count < dtmul_count)
18601 rval = EINVAL;
18602
18603 uuids_list->dtmul_count = dtmul_count;
18604
18605 /*
18606 * Copyout the symbols list (or at least the count!)
18607 */
18608 if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
18609 cmn_err(CE_WARN, "failed copyout of dtrace_module_uuids_list_t");
18610 rval = EFAULT;
18611 }
18612
18613 moduuidslist_cleanup:
18614 /*
18615 * If we had to allocate struct memory, free it.
18616 */
18617 if (uuids_list != NULL) {
18618 kmem_free(uuids_list, module_uuids_list_size);
18619 }
18620
18621 return rval;
18622 }
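/*
 * Illustrative sketch (not part of the original source): passing a
 * dtmul_count of zero turns this ioctl into a pure "how many?" query (which
 * deliberately fails with EINVAL when modules are waiting), so a consumer
 * sizes its buffer in two steps:
 */
#if 0
	dtrace_module_uuids_list_t query = { .dtmul_count = 0 };
	dtrace_module_uuids_list_t *list;
	void *ulist = &query;

	/* 1. Query: the count of kexts needing symbols is copied back. */
	(void) ioctl(fd, DTRACEIOC_MODUUIDSLIST, &ulist);

	/* 2. Fetch: allocate for that count and ask again. */
	list = calloc(1, DTRACE_MODULE_UUIDS_LIST_SIZE(query.dtmul_count));
	list->dtmul_count = query.dtmul_count;
	ulist = list;
	if (ioctl(fd, DTRACEIOC_MODUUIDSLIST, &ulist) == 0) {
		/* list->dtmul_uuid[0 .. dtmul_count - 1] need symbols */
	}
#endif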
18623
18624 case DTRACEIOC_PROVMODSYMS: {
18625 size_t module_symbols_size;
18626 dtrace_module_symbols_t* module_symbols;
18627 uint64_t dtmodsyms_count;
18628
18629 /*
18630 * Security restrictions make this operation illegal: when they are
18631 * in effect, DTrace must refuse to provide any fbt probes.
18632 */
18633 if (dtrace_fbt_probes_restricted()) {
18634 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_PROVMODSYMS");
18635 return (EPERM);
18636 }
18637
18638 /*
18639 * Fail if the kernel symbol mode makes this operation illegal.
18640 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states; it is legal to
18641 * check for them without holding the dtrace_lock.
18642 */
18643 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18644 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18645 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
18646 return (EPERM);
18647 }
18648
18649 /*
18650 * Read the number of module symbols structs being passed in.
18651 */
18652 if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
18653 &dtmodsyms_count,
18654 sizeof(dtmodsyms_count))) {
18655 cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
18656 return (EFAULT);
18657 }
18658
18659 /*
18660 * Range check the count. How much data can we pass around?
18661 * FIX ME!
18662 */
18663 if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) {
18664 cmn_err(CE_WARN, "dtmodsyms_count is not valid");
18665 return (EINVAL);
18666 }
18667
18668 /*
18669 * Allocate a correctly sized structure and copyin the data.
18670 */
18671 module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
18672 if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
18673 return (ENOMEM);
18674
18675 rval = 0;
18676
18677 /* NOTE! We can no longer exit this method via return */
18678 if (copyin(arg, module_symbols, module_symbols_size) != 0) {
18679 cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t");
18680 rval = EFAULT;
18681 goto module_symbols_cleanup;
18682 }
18683
18684 /*
18685 * Check that the count didn't change between the first copyin and the second.
18686 */
18687 if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
18688 rval = EINVAL;
18689 goto module_symbols_cleanup;
18690 }
18691
18692 /*
18693 * Find the modctl to add symbols to.
18694 */
18695 lck_mtx_lock(&dtrace_provider_lock);
18696 lck_mtx_lock(&mod_lock);
18697
18698 struct modctl* ctl = dtrace_modctl_list;
18699 while (ctl) {
18700 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18701 if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
18702 dtrace_provider_t *prv;
18703 ctl->mod_user_symbols = module_symbols;
18704
18705 /*
18706 * We're going to call each provider's per-module provide operation,
18707 * specifying only this module.
18708 */
18709 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
18710 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
18711 /*
18712 * We gave every provider a chance to provide with the user syms; go ahead and clear them.
18713 */
18714 ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
18715 }
18716 ctl = ctl->mod_next;
18717 }
18718
18719 lck_mtx_unlock(&mod_lock);
18720 lck_mtx_unlock(&dtrace_provider_lock);
18721
18722 module_symbols_cleanup:
18723 /*
18724 * If we had to allocate struct memory, free it.
18725 */
18726 if (module_symbols != NULL) {
18727 kmem_free(module_symbols, module_symbols_size);
18728 }
18729
18730 return rval;
18731 }
18732
18733 case DTRACEIOC_PROCWAITFOR: {
18734 dtrace_procdesc_t pdesc = {
18735 .p_name = {0},
18736 .p_pid = -1
18737 };
18738
18739 if ((rval = copyin(arg, &pdesc, sizeof(pdesc))) != 0)
18740 goto proc_waitfor_error;
18741
18742 if ((rval = dtrace_proc_waitfor(&pdesc)) != 0)
18743 goto proc_waitfor_error;
18744
18745 if ((rval = copyout(&pdesc, arg, sizeof(pdesc))) != 0)
18746 goto proc_waitfor_error;
18747
18748 return 0;
18749
18750 proc_waitfor_error:
18751 /* The process was suspended; revert this, since the client will not do it. */
18752 if (pdesc.p_pid != -1) {
18753 proc_t *proc = proc_find(pdesc.p_pid);
18754 if (proc != PROC_NULL) {
18755 task_pidresume(proc->task);
18756 proc_rele(proc);
18757 }
18758 }
18759
18760 return rval;
18761 }
18762
18763 default:
18764 break;
18765 }
18766
18767 return (ENOTTY);
18768 }
18769
18770 /*
18771 * APPLE NOTE: dtrace_detach not implemented
18772 */
18773 #if !defined(__APPLE__)
18774 /*ARGSUSED*/
18775 static int
18776 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
18777 {
18778 dtrace_state_t *state;
18779
18780 switch (cmd) {
18781 case DDI_DETACH:
18782 break;
18783
18784 case DDI_SUSPEND:
18785 return (DDI_SUCCESS);
18786
18787 default:
18788 return (DDI_FAILURE);
18789 }
18790
18791 lck_mtx_lock(&cpu_lock);
18792 lck_mtx_lock(&dtrace_provider_lock);
18793 lck_mtx_lock(&dtrace_lock);
18794
18795 ASSERT(dtrace_opens == 0);
18796
18797 if (dtrace_helpers > 0) {
18798 lck_mtx_unlock(&dtrace_lock);
18799 lck_mtx_unlock(&dtrace_provider_lock);
18800 lck_mtx_unlock(&cpu_lock);
18801 return (DDI_FAILURE);
18802 }
18803
18804 if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
18805 lck_mtx_unlock(&dtrace_lock);
18806 lck_mtx_unlock(&dtrace_provider_lock);
18807 lck_mtx_unlock(&cpu_lock);
18808 return (DDI_FAILURE);
18809 }
18810
18811 dtrace_provider = NULL;
18812
18813 if ((state = dtrace_anon_grab()) != NULL) {
18814 /*
18815 * If there were ECBs on this state, the provider should
18816 * not have been allowed to detach; assert that there are
18817 * none.
18818 */
18819 ASSERT(state->dts_necbs == 0);
18820 dtrace_state_destroy(state);
18821
18822 /*
18823 * If we're being detached with anonymous state, we need to
18824 * indicate to the kernel debugger that DTrace is now inactive.
18825 */
18826 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
18827 }
18828
18829 bzero(&dtrace_anon, sizeof (dtrace_anon_t));
18830 unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
18831 dtrace_cpu_init = NULL;
18832 dtrace_helpers_cleanup = NULL;
18833 dtrace_helpers_fork = NULL;
18834 dtrace_cpustart_init = NULL;
18835 dtrace_cpustart_fini = NULL;
18836 dtrace_debugger_init = NULL;
18837 dtrace_debugger_fini = NULL;
18838 dtrace_kreloc_init = NULL;
18839 dtrace_kreloc_fini = NULL;
18840 dtrace_modload = NULL;
18841 dtrace_modunload = NULL;
18842
18843 lck_mtx_unlock(&cpu_lock);
18844
18845 if (dtrace_helptrace_enabled) {
18846 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
18847 dtrace_helptrace_buffer = NULL;
18848 }
18849
18850 kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
18851 dtrace_probes = NULL;
18852 dtrace_nprobes = 0;
18853
18854 dtrace_hash_destroy(dtrace_strings);
18855 dtrace_hash_destroy(dtrace_byprov);
18856 dtrace_hash_destroy(dtrace_bymod);
18857 dtrace_hash_destroy(dtrace_byfunc);
18858 dtrace_hash_destroy(dtrace_byname);
18859 dtrace_strings = NULL;
18860 dtrace_byprov = NULL;
18861 dtrace_bymod = NULL;
18862 dtrace_byfunc = NULL;
18863 dtrace_byname = NULL;
18864
18865 kmem_cache_destroy(dtrace_state_cache);
18866 vmem_destroy(dtrace_arena);
18867
18868 if (dtrace_toxrange != NULL) {
18869 kmem_free(dtrace_toxrange,
18870 dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
18871 dtrace_toxrange = NULL;
18872 dtrace_toxranges = 0;
18873 dtrace_toxranges_max = 0;
18874 }
18875
18876 ddi_remove_minor_node(dtrace_devi, NULL);
18877 dtrace_devi = NULL;
18878
18879 ddi_soft_state_fini(&dtrace_softstate);
18880
18881 ASSERT(dtrace_vtime_references == 0);
18882 ASSERT(dtrace_opens == 0);
18883 ASSERT(dtrace_retained == NULL);
18884
18885 lck_mtx_unlock(&dtrace_lock);
18886 lck_mtx_unlock(&dtrace_provider_lock);
18887
18888 #ifdef illumos
18889 /*
18890 * We don't destroy the task queue until after we have dropped our
18891 * locks (taskq_destroy() may block on running tasks). To prevent
18892 * attempting to do work after we have effectively detached but before
18893 * the task queue has been destroyed, all tasks dispatched via the
18894 * task queue must check that DTrace is still attached before
18895 * performing any operation.
18896 */
18897 taskq_destroy(dtrace_taskq);
18898 dtrace_taskq = NULL;
18899 #endif
18900
18901 return (DDI_SUCCESS);
18902 }
18903 #endif /* __APPLE__ */
18904
18905 d_open_t _dtrace_open, helper_open;
18906 d_close_t _dtrace_close, helper_close;
18907 d_ioctl_t _dtrace_ioctl, helper_ioctl;
18908
18909 int
18910 _dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
18911 {
18912 #pragma unused(p)
18913 dev_t locdev = dev;
18914
18915 return dtrace_open( &locdev, flags, devtype, CRED());
18916 }
18917
18918 int
18919 helper_open(dev_t dev, int flags, int devtype, struct proc *p)
18920 {
18921 #pragma unused(dev,flags,devtype,p)
18922 return 0;
18923 }
18924
18925 int
18926 _dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
18927 {
18928 #pragma unused(p)
18929 return dtrace_close( dev, flags, devtype, CRED());
18930 }
18931
18932 int
18933 helper_close(dev_t dev, int flags, int devtype, struct proc *p)
18934 {
18935 #pragma unused(dev,flags,devtype,p)
18936 return 0;
18937 }
18938
18939 int
18940 _dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
18941 {
18942 #pragma unused(p)
18943 int err, rv = 0;
18944 user_addr_t uaddrp;
18945
18946 if (proc_is64bit(p))
18947 uaddrp = *(user_addr_t *)data;
18948 else
18949 uaddrp = (user_addr_t) *(uint32_t *)data;
18950
18951 err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
18952
18953 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
18954 if (err != 0) {
18955 ASSERT( (err & 0xfffff000) == 0 );
18956 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
18957 } else if (rv != 0) {
18958 ASSERT( (rv & 0xfff00000) == 0 );
18959 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
18960 } else
18961 return 0;
18962 }
18963
18964 int
18965 helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
18966 {
18967 #pragma unused(dev,fflag,p)
18968 int err, rv = 0;
18969
18970 err = dtrace_ioctl_helper(cmd, data, &rv);
18971 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
18972 if (err != 0) {
18973 ASSERT( (err & 0xfffff000) == 0 );
18974 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
18975 } else if (rv != 0) {
18976 ASSERT( (rv & 0xfff00000) == 0 );
18977 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
18978 } else
18979 return 0;
18980 }
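/*
 * Illustrative sketch (not part of the original source): given the 12/20-bit
 * split used by both shims above, a caller can recover whether an errno came
 * from the ioctl path (err) or from the Solaris-style out-parameter (rv):
 */
#if 0
	if (ioctl(fd, cmd, &uarg) == -1) {
		if (errno < 4096) {
			int err = errno;	/* low 12 bits: a real error code */
		} else {
			int rv = errno >> 12;	/* high 20 bits: the rv value */
		}
	}
#endif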
18981
18982 #define HELPER_MAJOR -24 /* let the kernel pick the device number */
18983
18984 /*
18985 * A struct describing which functions will get invoked for certain
18986 * actions.
18987 */
18988 static struct cdevsw helper_cdevsw =
18989 {
18990 helper_open, /* open */
18991 helper_close, /* close */
18992 eno_rdwrt, /* read */
18993 eno_rdwrt, /* write */
18994 helper_ioctl, /* ioctl */
18995 (stop_fcn_t *)nulldev, /* stop */
18996 (reset_fcn_t *)nulldev, /* reset */
18997 NULL, /* tty's */
18998 eno_select, /* select */
18999 eno_mmap, /* mmap */
19000 eno_strat, /* strategy */
19001 eno_getc, /* getc */
19002 eno_putc, /* putc */
19003 0 /* type */
19004 };
19005
19006 static int helper_majdevno = 0;
19007
19008 static int gDTraceInited = 0;
19009
19010 void
19011 helper_init( void )
19012 {
19013 /*
19014 * Once the "helper" is initialized, it can take ioctl calls that use locks
19015 * and zones initialized in dtrace_init. Make certain dtrace_init was called
19016 * before us.
19017 */
19018
19019 if (!gDTraceInited) {
19020 panic("helper_init before dtrace_init\n");
19021 }
19022
19023 if (0 >= helper_majdevno)
19024 {
19025 helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
19026
19027 if (helper_majdevno < 0) {
19028 printf("helper_init: failed to allocate a major number!\n");
19029 return;
19030 }
19031
19032 if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19033 DTRACEMNR_HELPER, 0 )) {
19034 printf("helper_init: failed to devfs_make_node for helper!\n");
19035 return;
19036 }
19037 } else
19038 panic("helper_init: called twice!\n");
19039 }
19040
19041 #undef HELPER_MAJOR
19042
19043 static int
19044 dtrace_clone_func(dev_t dev, int action)
19045 {
19046 #pragma unused(dev)
19047
19048 if (action == DEVFS_CLONE_ALLOC) {
19049 return dtrace_state_reserve();
19050 }
19051 else if (action == DEVFS_CLONE_FREE) {
19052 return 0;
19053 }
19054 else return -1;
19055 }
19056
19057 void dtrace_ast(void);
19058
19059 void
19060 dtrace_ast(void)
19061 {
19062 int i;
19063 uint32_t clients = os_atomic_xchg(&dtrace_wake_clients, 0, relaxed);
19064 if (clients == 0)
19065 return;
19066 /*
19067 * We disable preemption here to be sure that we won't get
19068 * preempted by a woken thread of higher priority before we
19069 * have issued all of the wakeups.
19070 */
19071 disable_preemption();
19072 for (i = 0; i < DTRACE_NCLIENTS; i++) {
19073 if (clients & (1 << i)) {
19074 dtrace_state_t *state = dtrace_state_get(i);
19075 if (state) {
19076 wakeup(state);
19077 }
19078
19079 }
19080 }
19081 enable_preemption();
19082 }
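/*
 * Illustrative sketch (not part of the original source): dtrace_wake_clients
 * is a bitmask with one bit per minor device, so the producer side (not
 * shown here) would mark a consumer for wakeup roughly like this before
 * posting the AST:
 */
#if 0
	/* minor is the consumer's minor number, 0 <= minor < DTRACE_NCLIENTS */
	os_atomic_or(&dtrace_wake_clients, 1 << minor, relaxed);
#endif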
19083
19084
19085 #define DTRACE_MAJOR -24 /* let the kernel pick the device number */
19086
19087 static struct cdevsw dtrace_cdevsw =
19088 {
19089 _dtrace_open, /* open */
19090 _dtrace_close, /* close */
19091 eno_rdwrt, /* read */
19092 eno_rdwrt, /* write */
19093 _dtrace_ioctl, /* ioctl */
19094 (stop_fcn_t *)nulldev, /* stop */
19095 (reset_fcn_t *)nulldev, /* reset */
19096 NULL, /* tty's */
19097 eno_select, /* select */
19098 eno_mmap, /* mmap */
19099 eno_strat, /* strategy */
19100 eno_getc, /* getc */
19101 eno_putc, /* putc */
19102 0 /* type */
19103 };
19104
19105 lck_attr_t* dtrace_lck_attr;
19106 lck_grp_attr_t* dtrace_lck_grp_attr;
19107 lck_grp_t* dtrace_lck_grp;
19108
19109 static int gMajDevNo;
19110
19111 void dtrace_early_init (void)
19112 {
19113 dtrace_restriction_policy_load();
19114
19115 /*
19116 * See dtrace_impl.h for a description of kernel symbol modes.
19117 * The default is to wait for symbols from userspace (lazy symbols).
19118 */
19119 if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
19120 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
19121 }
19122 }
19123
19124 void
19125 dtrace_init( void )
19126 {
19127 if (0 == gDTraceInited) {
19128 int i, ncpu;
19129 size_t size = sizeof(dtrace_buffer_memory_maxsize);
19130
19131 /*
19132 * DTrace allocates buffers based on the maximum number
19133 * of enabled cpus. This call avoids any race when finding
19134 * that count.
19135 */
19136 ASSERT(dtrace_max_cpus == 0);
19137 ncpu = dtrace_max_cpus = ml_get_max_cpus();
19138
19139 /*
19140 * Retrieve the size of physical memory in order to define the
19141 * maximum size of the state buffer memory. If we cannot retrieve
19142 * this value, we'll assume 1 GB of memory per CPU; that's still
19143 * better than raising a kernel panic.
19144 */
19145 if (0 != kernel_sysctlbyname("hw.memsize", &dtrace_buffer_memory_maxsize,
19146 &size, NULL, 0))
19147 {
19148 dtrace_buffer_memory_maxsize = (uint64_t)ncpu * 1024ULL * 1024 * 1024; /* widen to avoid 32-bit overflow */
19149 printf("dtrace_init: failed to retrieve hw.memsize, defaulted to %llu bytes\n",
19150 dtrace_buffer_memory_maxsize);
19151 }
19152
19153 /*
19154 * Finally, divide by three to prevent DTrace from eating too
19155 * much memory.
19156 */
19157 dtrace_buffer_memory_maxsize /= 3;
19158 ASSERT(dtrace_buffer_memory_maxsize > 0);
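/*
 * Worked example (not part of the original source): with 16 GB reported by
 * hw.memsize the cap becomes 16 GB / 3, roughly 5.3 GB; on the fallback
 * path with, say, 8 CPUs it is (8 * 1 GB) / 3, roughly 2.6 GB.
 */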
19159
19160 gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
19161
19162 if (gMajDevNo < 0) {
19163 printf("dtrace_init: failed to allocate a major number!\n");
19164 gDTraceInited = 0;
19165 return;
19166 }
19167
19168 if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19169 dtrace_clone_func, DTRACEMNR_DTRACE, 0 )) {
19170 printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
19171 gDTraceInited = 0;
19172 return;
19173 }
19174
19175 /*
19176 * Allocate the dtrace_probe_t zone
19177 */
19178 dtrace_probe_t_zone = zinit(sizeof(dtrace_probe_t),
19179 1024 * sizeof(dtrace_probe_t),
19180 sizeof(dtrace_probe_t),
19181 "dtrace.dtrace_probe_t");
19182
19183 /*
19184 * Create the dtrace lock group and attrs.
19185 */
19186 dtrace_lck_attr = lck_attr_alloc_init();
19187 dtrace_lck_grp_attr = lck_grp_attr_alloc_init();
19188 dtrace_lck_grp = lck_grp_alloc_init("dtrace", dtrace_lck_grp_attr);
19189
19190 /*
19191 * We have to initialize all locks explicitly
19192 */
19193 lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr);
19194 lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr);
19195 lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr);
19196 lck_mtx_init(&dtrace_procwaitfor_lock, dtrace_lck_grp, dtrace_lck_attr);
19197 #if DEBUG
19198 lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr);
19199 #endif
19200 lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr);
19201
19202 /*
19203 * The cpu_core structure consists of per-CPU state available in any context.
19204 * On some architectures, this may mean that the page(s) containing the
19205 * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
19206 * is up to the platform to assure that this is performed properly. Note that
19207 * the structure is sized to avoid false sharing.
19208 */
19209 lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr);
19210 lck_mtx_init(&cyc_lock, dtrace_lck_grp, dtrace_lck_attr);
19211 lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr);
19212
19213 /*
19214 * Initialize the CPU offline/online hooks.
19215 */
19216 dtrace_install_cpu_hooks();
19217
19218 dtrace_modctl_list = NULL;
19219
19220 cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
19221 for (i = 0; i < ncpu; ++i) {
19222 lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr);
19223 }
19224
19225 cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
19226 for (i = 0; i < ncpu; ++i) {
19227 cpu_list[i].cpu_id = (processorid_t)i;
19228 cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]);
19229 LIST_INIT(&cpu_list[i].cpu_cyc_list);
19230 lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr);
19231 }
19232
19233 lck_mtx_lock(&cpu_lock);
19234 for (i = 0; i < ncpu; ++i)
19235 /* FIXME: track CPU configuration */
19236 dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */
19237 lck_mtx_unlock(&cpu_lock);
19238
19239 (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
19240
19241 dtrace_strings = dtrace_hash_create(dtrace_strkey_offset,
19242 offsetof(dtrace_string_t, dtst_str),
19243 offsetof(dtrace_string_t, dtst_next),
19244 offsetof(dtrace_string_t, dtst_prev));
19245
19246 dtrace_isa_init();
19247 /*
19248 * See dtrace_impl.h for a description of dof modes.
19249 * The default is lazy dof.
19250 *
19251 * FIXME: Warn if state is LAZY_OFF? It won't break anything, but
19252 * makes no sense...
19253 */
19254 if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
19255 #if CONFIG_EMBEDDED
19256 /* Disable DOF mode by default for performance reasons */
19257 dtrace_dof_mode = DTRACE_DOF_MODE_NEVER;
19258 #else
19259 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
19260 #endif
19261 }
19262
19263 /*
19264 * Sanity check of dof mode value.
19265 */
19266 switch (dtrace_dof_mode) {
19267 case DTRACE_DOF_MODE_NEVER:
19268 case DTRACE_DOF_MODE_LAZY_ON:
19269 /* valid modes, but nothing else we need to do */
19270 break;
19271
19272 case DTRACE_DOF_MODE_LAZY_OFF:
19273 case DTRACE_DOF_MODE_NON_LAZY:
19274 /* Cannot wait for a dtrace_open to init fasttrap */
19275 fasttrap_init();
19276 break;
19277
19278 default:
19279 /* Invalid, clamp to non lazy */
19280 dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
19281 fasttrap_init();
19282 break;
19283 }
19284
19285 #if CONFIG_DTRACE
19286 if (dtrace_dof_mode != DTRACE_DOF_MODE_NEVER)
19287 commpage_update_dof(true);
19288 #endif
19289
19290 gDTraceInited = 1;
19291
19292 } else
19293 panic("dtrace_init: called twice!\n");
19294 }
19295
19296 void
19297 dtrace_postinit(void)
19298 {
19299 /*
19300 * Called from bsd_init after all providers' *_init() routines have been
19301 * run. That way, anonymous DOF enabled under dtrace_attach() is safe
19302 * to go.
19303 */
19304 dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0)); /* Punning a dev_t to a dev_info_t* */
19305
19306 /*
19307 * Add the mach_kernel to the module list for lazy processing
19308 */
19309 struct kmod_info fake_kernel_kmod;
19310 memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));
19311
19312 strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
19313 fake_kernel_kmod.id = 1;
19314 fake_kernel_kmod.address = g_kernel_kmod_info.address;
19315 fake_kernel_kmod.size = g_kernel_kmod_info.size;
19316
19317 if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) {
19318 printf("dtrace_postinit: Could not register mach_kernel modctl\n");
19319 }
19320
19321 (void)OSKextRegisterKextsWithDTrace();
19322 }
19323 #undef DTRACE_MAJOR
19324
19325 /*
19326 * Routines used to register interest in cpus being added to or removed
19327 * from the system.
19328 */
19329 void
19330 register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19331 {
19332 #pragma unused(ignore1,ignore2)
19333 }
19334
19335 void
19336 unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19337 {
19338 #pragma unused(ignore1,ignore2)
19339 }