4  * The contents of this file are subject to the terms of the 
   5  * Common Development and Distribution License (the "License"). 
   6  * You may not use this file except in compliance with the License. 
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 
   9  * or http://www.opensolaris.org/os/licensing. 
  10  * See the License for the specific language governing permissions 
  11  * and limitations under the License. 
  13  * When distributing Covered Code, include this CDDL HEADER in each 
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
  15  * If applicable, add the following below this CDDL HEADER, with the 
  16  * fields enclosed by brackets "[]" replaced with your own identifying 
  17  * information: Portions Copyright [yyyy] [name of copyright owner] 
  23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved. 
  24  * Use is subject to license terms. 
  26  * Portions Copyright (c) 2012 by Delphix. All rights reserved. 
  27  * Portions Copyright (c) 2016 by Joyent, Inc. 
  30 #ifndef _SYS_DTRACE_IMPL_H 
  31 #define _SYS_DTRACE_IMPL_H 
  33 /* #pragma ident        "@(#)dtrace_impl.h      1.23    07/02/16 SMI" */ 
  40  * DTrace Dynamic Tracing Software: Kernel Implementation Interfaces 
  42  * Note: The contents of this file are private to the implementation of the 
  43  * Solaris system and DTrace subsystem and are subject to change at any time 
  44  * without notice.  Applications and drivers using these interfaces will fail 
  45  * to run on future releases.  These interfaces should not be used for any 
  46  * purpose except those expressly outlined in dtrace(7D) and libdtrace(3LIB). 
  47  * Please refer to the "Solaris Dynamic Tracing Guide" for more information. 
  50 #include <sys/dtrace.h> 
  53  * DTrace Implementation Locks 
  /*
   * Lock object shared by the DTrace implementation.  This is only the
   * extern declaration; the definition is not part of this header section.
   * NOTE(review): what the lock protects is not stated here -- confirm at
   * the definition site.
   */
  55 extern lck_mtx_t dtrace_procwaitfor_lock
; 
  58  * DTrace Implementation Constants and Typedefs 
  60 #define DTRACE_MAXPROPLEN               128 
  61 #define DTRACE_DYNVAR_CHUNKSIZE         256 
  65 struct dtrace_predicate
; 
  67 struct dtrace_provider
; 
  70 typedef struct dtrace_probe dtrace_probe_t
; 
  71 typedef struct dtrace_ecb dtrace_ecb_t
; 
  72 typedef struct dtrace_predicate dtrace_predicate_t
; 
  73 typedef struct dtrace_action dtrace_action_t
; 
  74 typedef struct dtrace_provider dtrace_provider_t
; 
  75 typedef struct dtrace_meta dtrace_meta_t
; 
  76 typedef struct dtrace_state dtrace_state_t
; 
  77 typedef uint32_t dtrace_optid_t
; 
  78 typedef uint32_t dtrace_specid_t
; 
  79 typedef uint64_t dtrace_genid_t
; 
  84  * The probe is the fundamental unit of the DTrace architecture.  Probes are 
  85  * created by DTrace providers, and managed by the DTrace framework.  A probe 
  86  * is identified by a unique <provider, module, function, name> tuple, and has 
  87  * a unique probe identifier assigned to it.  (Some probes are not associated 
  88  * with a specific point in text; these are called _unanchored probes_ and have 
  89  * no module or function associated with them.)  Probes are represented as a 
  90  * dtrace_probe structure.  To allow quick lookups based on each element of the 
  91  * probe tuple, probes are hashed by each of provider, module, function and 
  92  * name.  (If a lookup is performed based on a regular expression, a 
  93  * dtrace_probekey is prepared, and a linear search is performed.) Each probe 
  94  * is additionally pointed to by a linear array indexed by its identifier.  The 
  95  * identifier is the provider's mechanism for indicating to the DTrace 
  96  * framework that a probe has fired:  the identifier is passed as the first 
  97  * argument to dtrace_probe(), where it is then mapped into the corresponding 
  98  * dtrace_probe structure.  From the dtrace_probe structure, dtrace_probe() can 
  99  * iterate over the probe's list of enabling control blocks; see "DTrace 
 100  * Enabling Control Blocks", below. 
 /*
  * One DTrace probe: its identifier, the list of ECBs enabled on it, the
  * owning provider and its argument, the module/function/name strings of the
  * probe tuple, and the links that thread the probe onto the module, function
  * and name hash chains.
  * NOTE(review): the closing line of this definition is not present in this
  * listing.
  */
 102 struct dtrace_probe 
{ 
 103         dtrace_id_t dtpr_id
;                    /* probe identifier */ 
 104         dtrace_ecb_t 
*dtpr_ecb
;                 /* first ECB in this probe's ECB list */ 
 105         dtrace_ecb_t 
*dtpr_ecb_last
;            /* last ECB in list */ 
 106         void *dtpr_arg
;                         /* provider argument */ 
 107         dtrace_cacheid_t dtpr_predcache
;        /* predicate cache ID */ 
 108         int dtpr_aframes
;                       /* artificial frames */ 
 109         dtrace_provider_t 
*dtpr_provider
;       /* pointer to provider */ 
 110         char *dtpr_mod
;                         /* probe's module name */ 
 111         char *dtpr_func
;                        /* probe's function name */ 
 112         char *dtpr_name
;                        /* probe's name */ 
 113         dtrace_probe_t 
*dtpr_nextmod
;           /* next in module hash */ 
 114         dtrace_probe_t 
*dtpr_prevmod
;           /* previous in module hash */ 
 115         dtrace_probe_t 
*dtpr_nextfunc
;          /* next in function hash */ 
 116         dtrace_probe_t 
*dtpr_prevfunc
;          /* previous in function hash */ 
 117         dtrace_probe_t 
*dtpr_nextname
;          /* next in name hash */ 
 118         dtrace_probe_t 
*dtpr_prevname
;          /* previous in name hash */ 
 119         dtrace_genid_t dtpr_gen
;                /* probe generation ID */ 
 122 typedef int dtrace_probekey_f(const char *, const char *, int); 
 /*
  * Search key for probe lookups.  For each element of the probe tuple
  * (provider, module, function, name) it pairs the string to match with the
  * matching function to apply, and it also carries a probe identifier to
  * match.
  * NOTE(review): the closing line of this definition is not present in this
  * listing.
  */
 124 typedef struct dtrace_probekey 
{ 
 125         const char *dtpk_prov
;                  /* provider name to match */ 
 126         dtrace_probekey_f 
*dtpk_pmatch
;         /* provider matching function */ 
 127         const char *dtpk_mod
;                   /* module name to match */ 
 128         dtrace_probekey_f 
*dtpk_mmatch
;         /* module matching function */ 
 129         const char *dtpk_func
;                  /* function name to match */ 
 130         dtrace_probekey_f 
*dtpk_fmatch
;         /* function-name matching function */ 
 131         const char *dtpk_name
;                  /* probe name to match */ 
 132         dtrace_probekey_f 
*dtpk_nmatch
;         /* probe-name matching function */ 
 133         dtrace_id_t dtpk_id
;                    /* identifier to match */ 
 /*
  * One bucket of a probe hash: it links to the next bucket on the same hash
  * chain, heads a chain of probes, and counts the probes on that chain.
  */
 136 typedef struct dtrace_hashbucket 
{ 
 137         struct dtrace_hashbucket 
*dthb_next
;    /* next bucket on hash chain */ 
 138         dtrace_probe_t 
*dthb_chain
;             /* chain of probes */ 
 139         int dthb_len
;                           /* number of probes on dthb_chain */ 
 140 } dtrace_hashbucket_t
; 
 /*
  * A hash table of probes: an array of bucket pointers plus its size, index
  * mask and bucket count.  The three dth_*offs members are offsets into the
  * probe structure for the next link, the previous link and the string.
  * NOTE(review): presumably these offsets let one table type serve the
  * module, function and name hashes -- confirm against the hash code.
  * NOTE(review): the closing line of this definition is not present in this
  * listing.
  */
 142 typedef struct dtrace_hash 
{ 
 143         dtrace_hashbucket_t 
**dth_tab
;          /* hash table */ 
 144         int dth_size
;                           /* size of hash table */ 
 145         int dth_mask
;                           /* mask to index into table */ 
 146         int dth_nbuckets
;                       /* total number of buckets */ 
 147         uintptr_t dth_nextoffs
;                 /* offset of next link within probe */ 
 148         uintptr_t dth_prevoffs
;                 /* offset of prev link within probe */ 
 149         uintptr_t dth_stroffs
;                  /* offset of string within probe */ 
 153  * DTrace Enabling Control Blocks 
 155  * When a provider wishes to fire a probe, it calls into dtrace_probe(), 
 156  * passing the probe identifier as the first argument.  As described above, 
 157  * dtrace_probe() maps the identifier into a pointer to a dtrace_probe_t 
 158  * structure.  This structure contains information about the probe, and a 
 159  * pointer to the list of Enabling Control Blocks (ECBs).  Each ECB points to 
 160  * DTrace consumer state, and contains an optional predicate, and a list of 
 161  * actions.  (Shown schematically below.)  The ECB abstraction allows a single 
 162  * probe to be multiplexed across disjoint consumers, or across disjoint 
 163  * enablings of a single probe within one consumer. 
 165  *   Enabling Control Block 
 167  * +------------------------+ 
 168  * | dtrace_epid_t ---------+--------------> Enabled Probe ID (EPID) 
 169  * | dtrace_state_t * ------+--------------> State associated with this ECB 
 170  * | dtrace_predicate_t * --+---------+ 
 171  * | dtrace_action_t * -----+----+    | 
 172  * | dtrace_ecb_t * ---+    |    |    |       Predicate (if any) 
 173  * +-------------------+----+    |    |       dtrace_predicate_t 
 174  *                     |         |    +---> +--------------------+ 
 175  *                     |         |          | dtrace_difo_t * ---+----> DIFO 
 176  *                     |         |          +--------------------+ 
 178  *            Next ECB |         |           Action 
 179  *            (if any) |         |       dtrace_action_t 
 180  *                     :         +--> +-------------------+ 
 181  *                     :              | dtrace_actkind_t -+------> kind 
 182  *                     v              | dtrace_difo_t * --+------> DIFO (if any) 
 183  *                                    | dtrace_recdesc_t -+------> record descr. 
 184  *                                    | dtrace_action_t * +------+ 
 185  *                                    +-------------------+      | 
 187  *                               +-------------------------------+  (if any) 
 191  *                               +--> +-------------------+ 
 192  *                                    | dtrace_actkind_t -+------> kind 
 193  *                                    | dtrace_difo_t * --+------> DIFO (if any) 
 194  *                                    | dtrace_action_t * +------+ 
 195  *                                    +-------------------+      | 
 197  *                               +-------------------------------+  (if any) 
 203  * dtrace_probe() iterates over the ECB list.  If the ECB needs less space 
 204  * than is available in the principal buffer, the ECB is processed:  if the 
 205  * predicate is non-NULL, the DIF object is executed.  If the result is 
 206  * non-zero, the action list is processed, with each action being executed 
 207  * accordingly.  When the action list has been completely executed, processing 
 208  * advances to the next ECB. The ECB abstraction allows disjoint consumers 
 209  * to multiplex on single probes. 
 211  * Execution of the ECB results in consuming dte_size bytes in the buffer 
 212  * to record data.  During execution, dte_needed bytes must be available in 
 213  * the buffer.  This space is used for both recorded data and tuple data. 
 /*
  * Members of an Enabling Control Block (ECB): the enabled probe ID, the
  * buffer space and alignment the ECB requires, its optional predicate, its
  * action list (first and last), the next ECB on the same probe, and back
  * pointers to the consumer state and the probe.
  * NOTE(review): neither the opening "struct ... {" line nor the closing line
  * of this definition is present in this listing.
  */
 216         dtrace_epid_t dte_epid
;                 /* enabled probe ID */ 
 217         uint32_t dte_alignment
;                 /* required alignment */ 
 218         size_t dte_needed
;                      /* space needed for execution */ 
 219         size_t dte_size
;                        /* size of recorded payload */ 
 220         dtrace_predicate_t 
*dte_predicate
;      /* predicate, if any */ 
 221         dtrace_action_t 
*dte_action
;            /* first action, if any */ 
 222         dtrace_ecb_t 
*dte_next
;                 /* next ECB on probe */ 
 223         dtrace_state_t 
*dte_state
;              /* pointer to state */ 
 224         uint32_t dte_cond
;                      /* security condition */ 
 225         dtrace_probe_t 
*dte_probe
;              /* pointer to probe */ 
 226         dtrace_action_t 
*dte_action_last
;       /* last action on ECB */ 
 227         uint64_t dte_uarg
;                      /* library argument */ 
 /*
  * An ECB's predicate: the DIF object to execute, a cache identifier, and a
  * reference count.
  * NOTE(review): the closing line of this definition is not present in this
  * listing.
  */
 230 struct dtrace_predicate 
{ 
 231         dtrace_difo_t 
*dtp_difo
;                /* DIF object */ 
 232         dtrace_cacheid_t dtp_cacheid
;           /* cache identifier */ 
 233         int dtp_refcnt
;                         /* reference count */ 
 /*
  * One action on an ECB's action list: its kind, an optional DIFO, the
  * description of the record it produces, a reference count, and links to
  * the previous and next actions.
  * NOTE(review): the closing line of this definition is not present in this
  * listing.
  */
 236 struct dtrace_action 
{ 
 237         dtrace_actkind_t dta_kind
;              /* kind of action */ 
 238         uint16_t dta_intuple
;                   /* boolean:  in aggregation */ 
 239         uint32_t dta_refcnt
;                    /* reference count */ 
 240         dtrace_difo_t 
*dta_difo
;                /* pointer to DIFO */ 
 241         dtrace_recdesc_t dta_rec
;               /* record description */ 
 242         dtrace_action_t 
*dta_prev
;              /* previous action */ 
 243         dtrace_action_t 
*dta_next
;              /* next action */ 
 /*
  * An aggregating action.  It embeds a dtrace_action_t as its first member,
  * followed by the aggregation's identifier, its ECB, the first action of
  * its tuple, and the parameters and function used to aggregate.
  * NOTE(review): the embedded action "must be first" -- presumably so that a
  * pointer to the action can be treated as a pointer to the aggregation;
  * confirm against the callers.
  */
 246 typedef struct dtrace_aggregation 
{ 
 247         dtrace_action_t dtag_action
;            /* action; must be first */ 
 248         dtrace_aggid_t dtag_id
;                 /* identifier */ 
 249         dtrace_ecb_t 
*dtag_ecb
;                 /* corresponding ECB */ 
 250         dtrace_action_t 
*dtag_first
;            /* first action in tuple */ 
 251         uint32_t dtag_base
;                     /* base of aggregation */ 
 252         uint8_t dtag_hasarg
;                    /* boolean:  has argument */ 
 253         uint64_t dtag_initial
;                  /* initial value */ 
 /*
  * Aggregating function.  NOTE(review): the meaning of its three arguments
  * is not documented here -- confirm against the implementations.
  */
 254         void (*dtag_aggregate
)(uint64_t *, uint64_t, uint64_t); 
 255 } dtrace_aggregation_t
; 
 260  * Principal buffers, aggregation buffers, and speculative buffers are all 
 261  * managed with the dtrace_buffer structure.  By default, this structure 
 262  * includes twin data buffers -- dtb_tomax and dtb_xamot -- that serve as the 
 263  * active and passive buffers, respectively.  For speculative buffers, 
 264  * dtb_xamot will be NULL; for "ring" and "fill" buffers, dtb_xamot will point 
 265  * to a scratch buffer.  For all buffer types, the dtrace_buffer structure is 
 266  * always allocated on a per-CPU basis; a single dtrace_buffer structure is 
 267  * never shared among CPUs.  (That is, there is never true sharing of the 
 268  * dtrace_buffer structure; to prevent false sharing of the structure, it must 
 269  * always be aligned to the coherence granularity -- generally 64 bytes.) 
 271  * One of the critical design decisions of DTrace is that a given ECB always 
 272  * stores the same quantity and type of data.  This is done to assure that the 
 273  * only metadata required for an ECB's traced data is the EPID.  That is, from 
 274  * the EPID, the consumer can determine the data layout.  (The data buffer 
 275  * layout is shown schematically below.)  By assuring that one can determine 
 276  * data layout from the EPID, the metadata stream can be separated from the 
 277  * data stream -- simplifying the data stream enormously.  The ECB always 
 278  * precedes the recorded data as part of the dtrace_rechdr_t structure that 
 279  * includes the EPID and a high-resolution timestamp used for output ordering. 
 282  *      base of data buffer --->  +--------+--------------------+--------+ 
 283  *                                | rechdr | data               | rechdr | 
 284  *                                +--------+------+--------+----+--------+ 
 285  *                                | data          | rechdr | data        | 
 286  *                                +---------------+--------+-------------+ 
 288  *                                +--------+--------------------+--------+ 
 289  *                                | rechdr | data               |        | 
 290  *                                +--------+--------------------+        | 
 300  *     limit of data buffer --->  +--------------------------------------+ 
 302  * When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the 
 303  * principal buffer (both scratch and payload) exceed the available space.  If 
 304  * the ECB's needs exceed available space (and if the principal buffer policy 
 305  * is the default "switch" policy), the ECB is dropped, the buffer's drop count 
 306  * is incremented, and processing advances to the next ECB.  If the ECB's needs 
 307  * can be met with the available space, the ECB is processed, but the offset in 
 308  * the principal buffer is only advanced if the ECB completes processing without error. 
 311  * When a buffer is to be switched (either because the buffer is the principal 
 312  * buffer with a "switch" policy or because it is an aggregation buffer), a 
 313  * cross call is issued to the CPU associated with the buffer.  In the cross 
 314  * call context, interrupts are disabled, and the active and the inactive 
 315  * buffers are atomically switched.  This involves switching the data pointers, 
 316  * copying the various state fields (offset, drops, errors, etc.) into their 
 317  * inactive equivalents, and clearing the state fields.  Because interrupts are 
 318  * disabled during this procedure, the switch is guaranteed to appear atomic to dtrace_probe(). 
 321  * DTrace Ring Buffering 
 323  * To process a ring buffer correctly, one must know the oldest valid record. 
 324  * Processing starts at the oldest record in the buffer and continues until 
 325  * the end of the buffer is reached.  Processing then resumes starting with 
 326  * the record stored at offset 0 in the buffer, and continues until the 
 327  * youngest record is processed.  If trace records are of a fixed-length, 
 328  * determining the oldest record is trivial: 
 330  *   - If the ring buffer has not wrapped, the oldest record is the record 
 331  *     stored at offset 0. 
 333  *   - If the ring buffer has wrapped, the oldest record is the record stored 
 334  *     at the current offset. 
 336  * With variable length records, however, just knowing the current offset 
 337  * doesn't suffice for determining the oldest valid record:  assuming that one 
 338  * allows for arbitrary data, one has no way of searching forward from the 
 339  * current offset to find the oldest valid record.  (That is, one has no way 
 340  * of separating data from metadata.) It would be possible to simply refuse to 
 341  * process any data in the ring buffer between the current offset and the 
 342  * limit, but this leaves (potentially) an enormous amount of otherwise valid data unprocessed. 
 345  * To effect ring buffering, we track two offsets in the buffer:  the current 
 346  * offset and the _wrapped_ offset.  If a request is made to reserve some 
 347  * amount of data, and the buffer has wrapped, the wrapped offset is 
 348  * incremented until the wrapped offset minus the current offset is greater 
 349  * than or equal to the reserve request.  This is done by repeatedly looking 
 350  * up the ECB corresponding to the EPID at the current wrapped offset, and 
 351  * incrementing the wrapped offset by the size of the data payload 
 352  * corresponding to that ECB.  If this offset is greater than or equal to the 
 353  * limit of the data buffer, the wrapped offset is set to 0.  Thus, the 
 354  * current offset effectively "chases" the wrapped offset around the buffer. 
 357  *      base of data buffer --->  +------+--------------------+------+ 
 358  *                                | EPID | data               | EPID | 
 359  *                                +------+--------+------+----+------+ 
 360  *                                | data          | EPID | data      | 
 361  *                                +---------------+------+-----------+ 
 363  *                                +------+---------------------------+ 
 365  *           current offset --->  +------+---------------------------+ 
 367  *           wrapped offset --->  +------+--------------------+------+ 
 368  *                                | EPID | data               | EPID | 
 369  *                                +------+--------+------+----+------+ 
 370  *                                | data          | EPID | data      | 
 371  *                                +---------------+------+-----------+ 
 374  *                                .        ... valid data ...        . 
 377  *                                +------+-------------+------+------+ 
 378  *                                | EPID | data        | EPID | data | 
 379  *                                +------+------------++------+------+ 
 380  *                                | data, cont.       | leftover     | 
 381  *     limit of data buffer --->  +-------------------+--------------+ 
 383  * If the amount of requested buffer space exceeds the amount of space 
 384  * available between the current offset and the end of the buffer: 
 386  *  (1)  all words in the data buffer between the current offset and the limit 
 387  *       of the data buffer (marked "leftover", above) are set to 
 390  *  (2)  the wrapped offset is set to zero 
 392  *  (3)  the iteration process described above occurs until the wrapped offset 
 393  *       is greater than the amount of desired space. 
 395  * The wrapped offset is implemented by (re-)using the inactive offset. 
 396  * In a "switch" buffer policy, the inactive offset stores the offset in 
 397  * the inactive buffer; in a "ring" buffer policy, it stores the wrapped offset. 
 400  * DTrace Scratch Buffering 
 402  * Some ECBs may wish to allocate dynamically-sized temporary scratch memory. 
 403  * To accommodate such requests easily, scratch memory may be allocated in 
 404  * the buffer beyond the current offset plus the needed memory of the current 
 405  * ECB.  If there isn't sufficient room in the buffer for the requested amount 
 406  * of scratch space, the allocation fails and an error is generated.  Scratch 
 407  * memory is tracked in the dtrace_mstate_t and is automatically freed when 
 408  * the ECB ceases processing.  Note that ring buffers cannot allocate their 
 409  * scratch from the principal buffer -- lest they needlessly overwrite older, 
 410  * valid data.  Ring buffers therefore have their own dedicated scratch buffer 
 411  * from which scratch is allocated. 
 413 #define DTRACEBUF_RING          0x0001          /* bufpolicy set to "ring" */ 
 414 #define DTRACEBUF_FILL          0x0002          /* bufpolicy set to "fill" */ 
 415 #define DTRACEBUF_NOSWITCH      0x0004          /* do not switch buffer */ 
 416 #define DTRACEBUF_WRAPPED       0x0008          /* ring buffer has wrapped */ 
 417 #define DTRACEBUF_DROPPED       0x0010          /* drops occurred */ 
 418 #define DTRACEBUF_ERROR         0x0020          /* errors occurred */ 
 419 #define DTRACEBUF_FULL          0x0040          /* "fill" buffer is full */ 
 420 #define DTRACEBUF_CONSUMED      0x0080          /* buffer has been consumed */ 
 421 #define DTRACEBUF_INACTIVE      0x0100          /* buffer is not yet active */ 
 /*
  * Bookkeeping for one data buffer.  It holds the active (dtb_tomax) and
  * inactive (dtb_xamot) data pointers, the offset/drop/error/flag state of
  * the active buffer, the matching dtb_xamot_* copies for the inactive
  * buffer, and switch timing.  The trailing pad exists to avoid false
  * sharing between per-CPU instances.
  * NOTE(review): the listing's own numbering jumps from 436 to 440 inside
  * this definition and its closing line is not present, so members may be
  * missing from this view.
  */
 423 typedef struct dtrace_buffer 
{ 
 424         uint64_t dtb_offset
;                    /* current offset in buffer */ 
 425         uint64_t dtb_cur_limit
;                 /* current limit before signaling/dropping */ 
 426         uint64_t dtb_limit
;                     /* limit before signaling */ 
 427         uint64_t dtb_size
;                      /* size of buffer */ 
 428         uint32_t dtb_flags
;                     /* flags */ 
 429         uint32_t dtb_drops
;                     /* number of drops */ 
 430         caddr_t dtb_tomax
;                      /* active buffer */ 
 431         caddr_t dtb_xamot
;                      /* inactive buffer */ 
 432         uint32_t dtb_xamot_flags
;               /* inactive flags */ 
 433         uint32_t dtb_xamot_drops
;               /* drops in inactive buffer */ 
 434         uint64_t dtb_xamot_offset
;              /* offset in inactive buffer; the wrapped offset under "ring" policy */ 
 435         uint32_t dtb_errors
;                    /* number of errors */ 
 436         uint32_t dtb_xamot_errors
;              /* errors in inactive buffer */ 
 440         uint64_t dtb_switched
;                  /* time of last switch */ 
 441         uint64_t dtb_interval
;                  /* observed switch interval */ 
 442         uint64_t dtb_pad2
[4];                   /* pad to avoid false sharing */ 
 446  * DTrace Aggregation Buffers 
 448  * Aggregation buffers use much of the same mechanism as described above 
 449  * ("DTrace Buffers").  However, because an aggregation is fundamentally a 
 450  * hash, there exists dynamic metadata associated with an aggregation buffer 
 451  * that is not associated with other kinds of buffers.  This aggregation 
 452  * metadata is _only_ relevant for the in-kernel implementation of 
 453  * aggregations; it is not actually relevant to user-level consumers.  To do 
 454  * this, we allocate dynamic aggregation data (hash keys and hash buckets) 
 455  * starting below the _limit_ of the buffer, and we allocate data from the 
 456  * _base_ of the buffer.  When the aggregation buffer is copied out, _only_ the 
 457  * data is copied out; the metadata is simply discarded.  Schematically, 
 458  * aggregation buffers look like: 
 460  *      base of data buffer --->  +-------+------+-----------+-------+ 
 461  *                                | aggid | key  | value     | aggid | 
 462  *                                +-------+------+-----------+-------+ 
 464  *                                +-------+-------+-----+------------+ 
 465  *                                | value | aggid | key | value      | 
 466  *                                +-------+------++-----+------+-----+ 
 467  *                                | aggid | key  | value       |     | 
 468  *                                +-------+------+-------------+     | 
 478  *                                |                ||   +------------+ 
 480  *                                +---------------------+            | 
 482  *                                | (dtrace_aggkey structures)       | 
 484  *                                +----------------------------------+ 
 486  *                                | (dtrace_aggbuffer structure)     | 
 488  *     limit of data buffer --->  +----------------------------------+ 
 491  * As implied above, just as we assure that ECBs always store a constant 
 492  * amount of data, we assure that a given aggregation -- identified by its 
 493  * aggregation ID -- always stores data of a constant quantity and type. 
 494  * As with EPIDs, this allows the aggregation ID to serve as the metadata for a record. 
 497  * Note that the size of the dtrace_aggkey structure must be sizeof (uintptr_t) 
 498  * aligned.  (If the structure changes such that this becomes false, an 
 499  * assertion will fail in dtrace_aggregate().) 
 /*
  * One aggregation hash key: its hash value, a 4-bit action and 28-bit size
  * packed into a single 32-bit word, a pointer to the key's data, and the
  * link to the next key on the same hash chain.  The size of this structure
  * must stay sizeof (uintptr_t) aligned.
  * NOTE(review): the closing line of this definition is not present in this
  * listing.
  */
 501 typedef struct dtrace_aggkey 
{ 
 502         uint32_t dtak_hashval
;                  /* hash value */ 
 503         uint32_t dtak_action
:4;                 /* action -- 4 bits */ 
 504         uint32_t dtak_size
:28;                  /* size -- 28 bits */ 
 505         caddr_t dtak_data
;                      /* data pointer */ 
 506         struct dtrace_aggkey 
*dtak_next
;        /* next in hash chain */ 
 /*
  * In-kernel metadata for an aggregation buffer: the number of hash buckets,
  * the free list of keys, and the hash table of dtrace_aggkey_t chains.  The
  * aggregation-buffer schematic places this structure at the limit of the
  * data buffer.
  */
 509 typedef struct dtrace_aggbuffer 
{ 
 510         uintptr_t dtagb_hashsize
;               /* number of buckets */ 
 511         uintptr_t dtagb_free
;                   /* free list of keys */ 
 512         dtrace_aggkey_t 
**dtagb_hash
;           /* hash table */ 
 513 } dtrace_aggbuffer_t
; 
 516  * DTrace Speculations 
 518  * Speculations have a per-CPU buffer and a global state.  Once a speculation 
 519  * buffer has been committed or discarded, it cannot be reused until all CPUs 
 520  * have taken the same action (commit or discard) on their respective 
 521  * speculative buffer.  However, because DTrace probes may execute in arbitrary 
 522  * context, other CPUs cannot simply be cross-called at probe firing time to 
 523  * perform the necessary commit or discard.  The speculation states thus 
 524  * optimize for the case that a speculative buffer is only active on one CPU at 
 525  * the time of a commit() or discard() -- for if this is the case, other CPUs 
 526  * need not take action, and the speculation is immediately available for 
 527  * reuse.  If the speculation is active on multiple CPUs, it must be 
 528  * asynchronously cleaned -- potentially leading to a higher rate of dirty 
 529  * speculative drops.  The speculation states are as follows: 
 531  *  DTRACESPEC_INACTIVE       <= Initial state; inactive speculation 
 532  *  DTRACESPEC_ACTIVE         <= Allocated, but not yet speculatively traced to 
 533  *  DTRACESPEC_ACTIVEONE      <= Speculatively traced to on one CPU 
 534  *  DTRACESPEC_ACTIVEMANY     <= Speculatively traced to on more than one CPU 
 535  *  DTRACESPEC_COMMITTING     <= Currently being committed on one CPU 
 536  *  DTRACESPEC_COMMITTINGMANY <= Currently being committed on many CPUs 
 537  *  DTRACESPEC_DISCARDING     <= Currently being discarded on many CPUs 
 539  * The state transition diagram is as follows: 
 541  *     +----------------------------------------------------------+ 
 544  *     |  +-------------------| COMMITTING |<-----------------+   | 
 545  *     |  |                   +------------+                  |   | 
 546  *     |  | copied spec.            ^             commit() on |   | discard() on 
 547  *     |  | into principal          |              active CPU |   | active CPU 
 550  * +----------+                 +--------+                +-----------+ 
 551  * | INACTIVE |---------------->| ACTIVE |--------------->| ACTIVEONE | 
 552  * +----------+  speculation()  +--------+  speculate()   +-----------+ 
 554  *     |  |                         | discard()               |   | 
 555  *     |  | asynchronously          |            discard() on |   | speculate() 
 556  *     |  | cleaned                 V            inactive CPU |   | on inactive 
 557  *     |  |                   +------------+                  |   | CPU 
 558  *     |  +-------------------| DISCARDING |<-----------------+   | 
 560  *     | asynchronously             ^                             | 
 561  *     | copied spec.               |       discard()             | 
 562  *     | into principal             +------------------------+    | 
 564  *  +----------------+             commit()              +------------+ 
 565  *  | COMMITTINGMANY |<----------------------------------| ACTIVEMANY | 
 566  *  +----------------+                                   +------------+ 
 568 typedef enum dtrace_speculation_state 
{ 
 569         DTRACESPEC_INACTIVE 
= 0, 
 571         DTRACESPEC_ACTIVEONE
, 
 572         DTRACESPEC_ACTIVEMANY
, 
 573         DTRACESPEC_COMMITTING
, 
 574         DTRACESPEC_COMMITTINGMANY
, 
 575         DTRACESPEC_DISCARDING
 
 576 } dtrace_speculation_state_t
; 
 /*
  * One speculation: its current state, a flag that is non-zero while the
  * speculation is being cleaned, and a pointer to its speculative buffer.
  * NOTE(review): the surrounding text says speculations have a per-CPU
  * buffer, so dtsp_buffer presumably points at per-CPU buffers -- confirm
  * at the allocation site.
  */
 578 typedef struct dtrace_speculation 
{ 
 579         dtrace_speculation_state_t dtsp_state
;  /* current speculation state */ 
 580         int dtsp_cleaning
;                      /* non-zero if being cleaned */ 
 581         dtrace_buffer_t 
*dtsp_buffer
;           /* speculative buffer */ 
 582 } dtrace_speculation_t
; 
 585  * DTrace Dynamic Variables 
 587  * The dynamic variable problem is obviously decomposed into two subproblems: 
 588  * allocating new dynamic storage, and freeing old dynamic storage.  The 
 589  * presence of the second problem makes the first much more complicated -- or 
 590  * rather, the absence of the second renders the first trivial.  This is the 
 591  * case with aggregations, for which there is effectively no deallocation of 
 592  * dynamic storage.  (Or more accurately, all dynamic storage is deallocated 
 593  * when a snapshot is taken of the aggregation.)  As DTrace dynamic variables 
 594  * allow for both dynamic allocation and dynamic deallocation, the 
 595  * implementation of dynamic variables is quite a bit more complicated than 
 596  * that of their aggregation kin. 
 598  * We observe that allocating new dynamic storage is tricky only because the 
 599  * size can vary -- the allocation problem is much easier if allocation sizes 
 600  * are uniform.  We further observe that in D, the size of dynamic variables is 
 601  * actually _not_ dynamic -- dynamic variable sizes may be determined by static 
 602  * analysis of DIF text.  (This is true even of putatively dynamically-sized 
 603  * objects like strings and stacks, the sizes of which are dictated by the 
 604  * "stringsize" and "stackframes" variables, respectively.)  We exploit this by 
 605  * performing this analysis on all DIF before enabling any probes.  For each 
 606  * dynamic load or store, we calculate the dynamically-allocated size plus the 
 607  * size of the dtrace_dynvar structure plus the storage required to key the 
 608  * data.  For all DIF, we take the largest value and dub it the _chunksize_. 
 609  * We then divide dynamic memory into two parts:  a hash table that is wide 
 610  * enough to have every chunk in its own bucket, and a larger region of equal 
 611  * chunksize units.  Whenever we wish to dynamically allocate a variable, we 
 612  * always allocate a single chunk of memory.  Depending on the uniformity of 
 613  * allocation, this will waste some amount of memory -- but it eliminates the 
 614  * non-determinism inherent in traditional heap fragmentation. 
 616  * Dynamic objects are allocated by storing a non-zero value to them; they are 
 617  * deallocated by storing a zero value to them.  Dynamic variables are 
 618  * complicated enormously by being shared between CPUs.  In particular, 
 619  * consider the following scenario: 
 622  *  +---------------------------------+   +---------------------------------+ 
 624  *  | allocates dynamic object a[123] |   |                                 | 
 625  *  | by storing the value 345 to it  |   |                                 | 
 627  *  |                                 |   | wishing to load from object     | 
 628  *  |                                 |   | a[123], performs lookup in      | 
 629  *  |                                 |   | dynamic variable space          | 
 631  *  | deallocates object a[123] by    |   |                                 | 
 632  *  | storing 0 to it                 |   |                                 | 
 634  *  | allocates dynamic object b[567] |   | performs load from a[123]       | 
 635  *  | by storing the value 789 to it  |   |                                 | 
 639  * This is obviously a race in the D program, but there are nonetheless only 
 640  * two valid values for CPU B's load from a[123]:  345 or 0.  Most importantly, 
 641  * CPU B may _not_ see the value 789 for a[123]. 
 643  * There are essentially two ways to deal with this: 
 645  *  (1)  Explicitly spin-lock variables.  That is, if CPU B wishes to load 
 646  *       from a[123], it needs to lock a[123] and hold the lock for the 
 647  *       duration that it wishes to manipulate it. 
 649  *  (2)  Avoid reusing freed chunks until it is known that no CPU is referring to them.
 652  * The implementation of (1) is rife with complexity, because it requires the 
 653  * user of a dynamic variable to explicitly decree when they are done using it. 
 654  * Were all variables by value, this perhaps wouldn't be debilitating -- but 
 655  * dynamic variables of non-scalar types are tracked by reference.  That is, if 
 656  * a dynamic variable is, say, a string, and that variable is to be traced to, 
 657  * say, the principal buffer, the DIF emulation code returns to the main 
 658  * dtrace_probe() loop a pointer to the underlying storage, not the contents of 
 659  * the storage.  Further, code calling on DIF emulation would have to be aware 
 660  * that the DIF emulation has returned a reference to a dynamic variable that 
 661  * has been potentially locked.  The variable would have to be unlocked after 
 662  * the main dtrace_probe() loop is finished with the variable, and the main 
 663  * dtrace_probe() loop would have to be careful to not call any further DIF 
 664  * emulation while the variable is locked to avoid deadlock.  More generally, 
 665  * if one were to implement (1), DIF emulation code dealing with dynamic 
 666  * variables could only deal with one dynamic variable at a time (lest deadlock 
 667  * result).  To sum, (1) exports too much subtlety to the users of dynamic 
 668  * variables -- increasing maintenance burden and imposing serious constraints 
 669  * on future DTrace development. 
 671  * The implementation of (2) is also complex, but the complexity is more 
 672  * manageable.  We need to be sure that when a variable is deallocated, it is 
 673  * not placed on a traditional free list, but rather on a _dirty_ list.  Once a 
 674  * variable is on a dirty list, it cannot be found by CPUs performing a 
 675  * subsequent lookup of the variable -- but it may still be in use by other 
 676  * CPUs.  To assure that all CPUs that may be seeing the old variable have 
 677  * cleared out of probe context, a dtrace_sync() can be issued.  Once the 
 678  * dtrace_sync() has completed, it can be known that all CPUs are done 
 679  * manipulating the dynamic variable -- the dirty list can be atomically 
 680  * appended to the free list.  Unfortunately, there's a slight hiccup in this 
 681  * mechanism:  dtrace_sync() may not be issued from probe context.  The 
 682  * dtrace_sync() must be therefore issued asynchronously from non-probe 
 683  * context.  For this we rely on the DTrace cleaner, a cyclic that runs at the 
 684  * "cleanrate" frequency.  To ease this implementation, we define several chunk lists:
 687  *   - Dirty.  Deallocated chunks, not yet cleaned.  Not available. 
 689  *   - Rinsing.  Formerly dirty chunks that are currently being asynchronously 
 690  *     cleaned.  Not available, but will be shortly.  Dynamic variable 
 691  *     allocation may not spin or block for availability, however. 
 693  *   - Clean.  Clean chunks, ready for allocation -- but not on the free list. 
 695  *   - Free.  Available for allocation. 
 697  * Moreover, to avoid absurd contention, _each_ of these lists is implemented 
 698  * on a per-CPU basis.  This is only for performance, not correctness; chunks 
 699  * may be allocated from another CPU's free list.  The algorithm for allocation is as follows:
 702  *   (1)  Attempt to atomically allocate from current CPU's free list.  If list 
 703  *        is non-empty and allocation is successful, allocation is complete. 
 705  *   (2)  If the clean list is non-empty, atomically move it to the free list, and reattempt (1).
 708  *   (3)  If the dynamic variable space is in the CLEAN state, look for free 
 709  *        and clean lists on other CPUs by setting the current CPU to the next 
 710  *        CPU, and reattempting (1).  If the next CPU is the current CPU (that 
 711  *        is, if all CPUs have been checked), atomically switch the state of 
 712  *        the dynamic variable space based on the following: 
 714  *        - If no free chunks were found and no dirty chunks were found, 
 715  *          atomically set the state to EMPTY. 
 717  *        - If dirty chunks were found, atomically set the state to DIRTY. 
 719  *        - If rinsing chunks were found, atomically set the state to RINSING. 
 721  *   (4)  Based on state of dynamic variable space state, increment appropriate 
 722  *        counter to indicate dynamic drops (if in EMPTY state) vs. dynamic 
 723  *        dirty drops (if in DIRTY state) vs. dynamic rinsing drops (if in 
 724  *        RINSING state).  Fail the allocation. 
 726  * The cleaning cyclic operates with the following algorithm:  for all CPUs 
 727  * with a non-empty dirty list, atomically move the dirty list to the rinsing 
 728  * list.  Perform a dtrace_sync().  For all CPUs with a non-empty rinsing list, 
 729  * atomically move the rinsing list to the clean list.  Perform another 
 730  * dtrace_sync().  By this point, all CPUs have seen the new clean list; the 
 731  * state of the dynamic variable space can be restored to CLEAN. 
 733  * There exist two final races that merit explanation.  The first is a simple allocation race:
 737  *  +---------------------------------+   +---------------------------------+ 
 739  *  | allocates dynamic object a[123] |   | allocates dynamic object a[123] | 
 740  *  | by storing the value 345 to it  |   | by storing the value 567 to it  | 
 745  * Again, this is a race in the D program.  It can be resolved by having a[123] 
 746  * hold the value 345 or a[123] hold the value 567 -- but it must be true that 
 747  * a[123] have only _one_ of these values.  (That is, the racing CPUs may not 
 748  * put the same element twice on the same hash chain.)  This is resolved 
 749  * simply:  before the allocation is undertaken, the start of the new chunk's 
 750  * hash chain is noted.  Later, after the allocation is complete, the hash 
 751  * chain is atomically switched to point to the new element.  If this fails 
 752  * (because of either concurrent allocations or an allocation concurrent with a 
 753  * deletion), the newly allocated chunk is deallocated to the dirty list, and 
 754  * the whole process of looking up (and potentially allocating) the dynamic 
 755  * variable is reattempted. 
 757  * The final race is a simple deallocation race: 
 760  *  +---------------------------------+   +---------------------------------+ 
 762  *  | deallocates dynamic object      |   | deallocates dynamic object      | 
 763  *  | a[123] by storing the value 0   |   | a[123] by storing the value 0   | 
 764  *  | to it                           |   | to it                           | 
 769  * Once again, this is a race in the D program, but it is one that we must 
 770  * handle without corrupting the underlying data structures.  Because 
 771  * deallocations require the deletion of a chunk from the middle of a hash 
 772  * chain, we cannot use a single-word atomic operation to remove it.  For this, 
 773  * we add a spin lock to the hash buckets that is _only_ used for deallocations 
 774  * (allocation races are handled as above).  Further, this spin lock is _only_ 
 775  * held for the duration of the delete; before control is returned to the DIF 
 776  * emulation code, the hash bucket is unlocked. 
 /* One key of a dynamic-variable tuple: a value (or pointer) plus its size. */
typedef struct dtrace_key {
	uint64_t dttk_value;		/* data value or data pointer */
	uint64_t dttk_size;		/* 0 if by-val, >0 if by-ref */
} dtrace_key_t;
 783 typedef struct dtrace_tuple 
{ 
 784         uint32_t dtt_nkeys
;                     /* number of keys in tuple */ 
 785         uint32_t dtt_pad
;                       /* padding */ 
 786         dtrace_key_t dtt_key
[1];                /* array of tuple keys */ 
 789 typedef struct dtrace_dynvar 
{ 
 790         uint64_t dtdv_hashval
;                  /* hash value -- 0 if free */ 
 791         struct dtrace_dynvar 
*dtdv_next
;        /* next on list or hash chain */ 
 792         void *dtdv_data
;                        /* pointer to data */ 
 793         dtrace_tuple_t dtdv_tuple
;              /* tuple key */ 
 /*
  * Operation selector for dynamic variables.
  *
  * NOTE(review): the embedded listing numbers jump from 796 to 798, so the
  * first enumerator line appears to be missing from this extract.  Restore it
  * from the original header; the values of the enumerators below depend on it.
  */
 796 typedef enum dtrace_dynvar_op 
{ 
 798         DTRACE_DYNVAR_NOALLOC
, 
 799         DTRACE_DYNVAR_DEALLOC
 
 800 } dtrace_dynvar_op_t
; 
 /*
  * One hash bucket of the dynamic variable space: the chain head, the
  * deallocation lock and padding.
  *
  * NOTE(review): dtdh_pad is declared twice with different sizes ([6] and
  * [14]) and listing lines 805, 807 and 809-810 are absent from this extract.
  * Presumably a preprocessor conditional selected one of the two pads and the
  * closing "} dtrace_dynhash_t;" followed -- confirm against the original
  * header before editing.
  */
 802 typedef struct dtrace_dynhash 
{ 
 803         dtrace_dynvar_t 
*dtdh_chain
;            /* hash chain for this bucket */ 
 804         uintptr_t dtdh_lock
;                    /* deallocation lock */ 
 806         uintptr_t dtdh_pad
[6];                  /* pad to avoid false sharing */ 
 808         uintptr_t dtdh_pad
[14];                 /* pad to avoid false sharing */ 
 /*
  * Per-CPU dynamic variable state: the free, dirty, rinsing and clean chunk
  * lists plus the three drop counters.
  *
  * NOTE(review): dtdsc_pad is declared twice (once as a scalar, once as [2])
  * and listing lines 820, 822 and 824 are absent from this extract.
  * Presumably a preprocessor conditional selected one of the two pads --
  * confirm against the original header before editing.
  */
 812 typedef struct dtrace_dstate_percpu 
{ 
 813         dtrace_dynvar_t 
*dtdsc_free
;            /* free list for this CPU */ 
 814         dtrace_dynvar_t 
*dtdsc_dirty
;           /* dirty list for this CPU */ 
 815         dtrace_dynvar_t 
*dtdsc_rinsing
;         /* rinsing list for this CPU */ 
 816         dtrace_dynvar_t 
*dtdsc_clean
;           /* clean list for this CPU */ 
 817         uint64_t dtdsc_drops
;                   /* number of capacity drops */ 
 818         uint64_t dtdsc_dirty_drops
;             /* number of dirty drops */ 
 819         uint64_t dtdsc_rinsing_drops
;           /* number of rinsing drops */ 
 821         uint64_t dtdsc_pad
;                     /* pad to avoid false sharing */ 
 823         uint64_t dtdsc_pad
[2];                  /* pad to avoid false sharing */ 
 825 } dtrace_dstate_percpu_t
; 
 /*
  * State of the dynamic variable space; DTRACE_DSTATE_CLEAN is explicitly 0.
  *
  * NOTE(review): listing lines 829-830 are absent from this extract, so two
  * enumerators appear to be missing between CLEAN and RINSING.  The prose
  * elsewhere in this file also mentions EMPTY and DIRTY states, so those are
  * presumably the missing ones -- confirm names and order against the
  * original header.
  */
 827 typedef enum dtrace_dstate_state 
{ 
 828         DTRACE_DSTATE_CLEAN 
= 0, 
 831         DTRACE_DSTATE_RINSING
 
 832 } dtrace_dstate_state_t
; 
 834 typedef struct dtrace_dstate 
{ 
 835         void *dtds_base
;                        /* base of dynamic var. space */ 
 836         size_t dtds_size
;                       /* size of dynamic var. space */ 
 837         size_t dtds_hashsize
;                   /* number of buckets in hash */ 
 838         size_t dtds_chunksize
;                  /* size of each chunk */ 
 839         dtrace_dynhash_t 
*dtds_hash
;            /* pointer to hash table */ 
 840         dtrace_dstate_state_t dtds_state
;       /* current dynamic var. state */ 
 841         dtrace_dstate_percpu_t 
*dtds_percpu
;    /* per-CPU dyn. var. state */ 
 845  * DTrace Variable State 
 847  * The DTrace variable state tracks user-defined variables in its dtrace_vstate 
 848  * structure.  Each DTrace consumer has exactly one dtrace_vstate structure, 
 849  * but some dtrace_vstate structures may exist without a corresponding DTrace 
 850  * consumer (see "DTrace Helpers", below).  As described in <sys/dtrace.h>, 
 851  * user-defined variables can have one of three scopes: 
 853  *  DIFV_SCOPE_GLOBAL  =>  global scope 
 854  *  DIFV_SCOPE_THREAD  =>  thread-local scope (i.e. "self->" variables) 
 855  *  DIFV_SCOPE_LOCAL   =>  clause-local scope (i.e. "this->" variables) 
 857  * The variable state tracks variables by both their scope and their allocation type:
 860  *  - The dtvs_globals and dtvs_locals members each point to an array of 
 861  *    dtrace_statvar structures.  These structures contain both the variable 
 862  *    metadata (dtrace_difv structures) and the underlying storage for all 
 863  *    statically allocated variables, including statically allocated 
 864  *    DIFV_SCOPE_GLOBAL variables and all DIFV_SCOPE_LOCAL variables. 
 866  *  - The dtvs_tlocals member points to an array of dtrace_difv structures for 
 867  *    DIFV_SCOPE_THREAD variables.  As such, this array tracks _only_ the 
 868  *    variable metadata for DIFV_SCOPE_THREAD variables; the underlying storage 
 869  *    is allocated out of the dynamic variable space. 
 871  *  - The dtvs_dynvars member is the dynamic variable state associated with the 
 872  *    variable state.  The dynamic variable state (described in "DTrace Dynamic 
 873  *    Variables", above) tracks all DIFV_SCOPE_THREAD variables and all 
 874  *    dynamically-allocated DIFV_SCOPE_GLOBAL variables. 
 876 typedef struct dtrace_statvar 
{ 
 877         uint64_t dtsv_data
;                     /* data or pointer to it */ 
 878         size_t dtsv_size
;                       /* size of pointed-to data */ 
 879         int dtsv_refcnt
;                        /* reference count */ 
 880         dtrace_difv_t dtsv_var
;                 /* variable metadata */ 
 883 typedef struct dtrace_vstate 
{ 
 884         dtrace_state_t 
*dtvs_state
;             /* back pointer to state */ 
 885         dtrace_statvar_t 
**dtvs_globals
;        /* statically-allocated glbls */ 
 886         int dtvs_nglobals
;                      /* number of globals */ 
 887         dtrace_difv_t 
*dtvs_tlocals
;            /* thread-local metadata */ 
 888         int dtvs_ntlocals
;                      /* number of thread-locals */ 
 889         dtrace_statvar_t 
**dtvs_locals
;         /* clause-local data */ 
 890         int dtvs_nlocals
;                       /* number of clause-locals */ 
 891         dtrace_dstate_t dtvs_dynvars
;           /* dynamic variable state */ 
 895  * DTrace Machine State 
 897  * In the process of processing a fired probe, DTrace needs to track and/or 
 898  * cache some per-CPU state associated with that particular firing.  This is 
 899  * state that is always discarded after the probe firing has completed, and 
 900  * much of it is not specific to any DTrace consumer, remaining valid across 
 901  * all ECBs.  This state is tracked in the dtrace_mstate structure. 
 /* One bit per cached machine-state item (presumably tested against dtms_present -- confirm). */
#define	DTRACE_MSTATE_ARGS		0x00000001
#define	DTRACE_MSTATE_PROBE		0x00000002
#define	DTRACE_MSTATE_EPID		0x00000004
#define	DTRACE_MSTATE_TIMESTAMP		0x00000008
#define	DTRACE_MSTATE_STACKDEPTH	0x00000010
#define	DTRACE_MSTATE_CALLER		0x00000020
#define	DTRACE_MSTATE_IPL		0x00000040
#define	DTRACE_MSTATE_FLTOFFS		0x00000080
#define	DTRACE_MSTATE_WALLTIMESTAMP	0x00000100
#define	DTRACE_MSTATE_USTACKDEPTH	0x00000200
#define	DTRACE_MSTATE_UCALLER		0x00000400
#define	DTRACE_MSTATE_MACHTIMESTAMP	0x00000800
 916 typedef struct dtrace_mstate 
{ 
 917         uintptr_t dtms_scratch_base
;            /* base of scratch space */ 
 918         uintptr_t dtms_scratch_ptr
;             /* current scratch pointer */ 
 919         size_t dtms_scratch_size
;               /* scratch size */ 
 920         uint32_t dtms_present
;                  /* variables that are present */ 
 921         uint64_t dtms_arg
[5];                   /* cached arguments */ 
 922         dtrace_epid_t dtms_epid
;                /* current EPID */ 
 923         uint64_t dtms_timestamp
;                /* cached timestamp */ 
 924         hrtime_t dtms_walltimestamp
;            /* cached wall timestamp */ 
 925         uint64_t dtms_machtimestamp
;            /* cached mach absolute timestamp */ 
 926         int dtms_stackdepth
;                    /* cached stackdepth */ 
 927         int dtms_ustackdepth
;                   /* cached ustackdepth */ 
 928         struct dtrace_probe 
*dtms_probe
;        /* current probe */ 
 929         uintptr_t dtms_caller
;                  /* cached caller */ 
 930         uint64_t dtms_ucaller
;                  /* cached user-level caller */ 
 931         int dtms_ipl
;                           /* cached interrupt pri lev */ 
 932         int dtms_fltoffs
;                       /* faulting DIFO offset */ 
 933         uintptr_t dtms_strtok
;                  /* saved strtok() pointer */ 
 934         uintptr_t dtms_strtok_limit
;            /* upper bound of strtok ptr */ 
 935         uint32_t dtms_access
;                   /* memory access rights */ 
 936         dtrace_difo_t 
*dtms_difo
;               /* current dif object */ 
 /* Condition flag bits. */
#define	DTRACE_COND_OWNER	0x1
#define	DTRACE_COND_USERMODE	0x2
#define	DTRACE_COND_ZONEOWNER	0x4

#define	DTRACE_PROBEKEY_MAXDEPTH	8	/* max glob recursion depth */

/*
 * Access flag used by dtrace_mstate.dtms_access.
 */
#define	DTRACE_ACCESS_KERNEL	0x1		/* the priv to read kmem */
 954  * Each DTrace consumer is in one of several states, which (for purposes of 
 955  * avoiding yet-another overloading of the noun "state") we call the current 
 956  * _activity_.  The activity transitions on dtrace_go() (from DTRACIOCGO), on 
 957  * dtrace_stop() (from DTRACIOCSTOP) and on the exit() action.  Activities may 
 958  * only transition in one direction; the activity transition diagram is a 
 959  * directed acyclic graph.  The activity transition diagram is as follows: 
 963  * +----------+                   +--------+                   +--------+ 
 964  * | INACTIVE |------------------>| WARMUP |------------------>| ACTIVE | 
 965  * +----------+   dtrace_go(),    +--------+   dtrace_go(),    +--------+ 
 966  *                before BEGIN        |        after BEGIN       |  |  | 
 968  *                      exit() action |                          |  |  | 
 969  *                     from BEGIN ECB |                          |  |  | 
 972  *                               +----------+     exit() action  |  |  | 
 973  * +-----------------------------| DRAINING |<-------------------+  |  | 
 976  * |                   dtrace_stop(), |                             |  | 
 980  * | +---------+                 +----------+                       |  | 
 981  * | | STOPPED |<----------------| COOLDOWN |<----------------------+  | 
 982  * | +---------+  dtrace_stop(), +----------+     dtrace_stop(),       | 
 983  * |                after END                       before END         | 
 986  * +----------------------------->| KILLED |<--------------------------+ 
 987  *       deadman timeout or       +--------+     deadman timeout or 
 988  *        killed consumer                         killed consumer 
 990  * Note that once a DTrace consumer has stopped tracing, there is no way to 
 991  * restart it; if a DTrace consumer wishes to restart tracing, it must reopen 
 992  * the DTrace pseudodevice. 
 /* Activity of a DTrace consumer, in declaration order starting at 0. */
typedef enum dtrace_activity {
	DTRACE_ACTIVITY_INACTIVE = 0,		/* not yet running */
	DTRACE_ACTIVITY_WARMUP,			/* while starting */
	DTRACE_ACTIVITY_ACTIVE,			/* running */
	DTRACE_ACTIVITY_DRAINING,		/* before stopping */
	DTRACE_ACTIVITY_COOLDOWN,		/* while stopping */
	DTRACE_ACTIVITY_STOPPED,		/* after stopping */
	DTRACE_ACTIVITY_KILLED			/* killed */
} dtrace_activity_t;
1006  * APPLE NOTE:  DTrace dof modes implementation 
1008  * DTrace has four "dof modes". They are: 
1010  * DTRACE_DOF_MODE_NEVER        Never load any dof, period. 
1011  * DTRACE_DOF_MODE_LAZY_ON      Defer loading dof until later 
1012  * DTRACE_DOF_MODE_LAZY_OFF     Load all deferred dof now, and any new dof  
1013  * DTRACE_DOF_MODE_NON_LAZY     Load all dof immediately. 
1015  * It is legal to transition between the two lazy modes. The NEVER and 
1016  * NON_LAZY modes are permanent, and must not change once set. 
1018  * The current dof mode is kept in dtrace_dof_mode, which is protected by the 
1019  * dtrace_dof_mode_lock. This is a RW lock, reads require shared access, writes 
1020  * require exclusive access. Because NEVER and NON_LAZY are permanent states, 
1021  * it is legal to test for those modes without holding the dof mode lock. 
1023  * Lock ordering is dof mode lock before any dtrace lock, and before the 
1024  * process p_dtrace_sprlock. In general, other locks should not be held when 
1025  * taking the dof mode lock. Acquiring the dof mode lock in exclusive mode 
1026  * will block process fork, exec, and exit, so it should be held exclusive 
1027  * for as short a time as possible. 
/* Values of the current dof mode. */
#define	DTRACE_DOF_MODE_NEVER		0
#define	DTRACE_DOF_MODE_LAZY_ON		1
#define	DTRACE_DOF_MODE_LAZY_OFF	2
#define	DTRACE_DOF_MODE_NON_LAZY	3
1036  * dtrace kernel symbol modes are used to control when the kernel may dispose of 
1037  * symbol information used by the fbt/sdt provider. The kernel itself, as well as 
1038  * every kext, has symbol table/nlist info that has historically been preserved 
1039  * for dtrace's use. This allowed dtrace to be lazy about allocating fbt/sdt probes, 
1040  * at the expense of keeping the symbol info in the kernel permanently. 
1042  * Starting in 10.7+, fbt probes may be created from userspace, in the same 
1043  * fashion as pid probes. The kernel allows dtrace "first right of refusal" 
1044  * whenever symbol data becomes available (such as a kext load). If dtrace is 
1045  * active, it will immediately read/copy the needed data, and then the kernel 
1046  * may free it. If dtrace is not active, it returns immediately, having done 
1047  * no work or allocations, and the symbol data is freed. Should dtrace need 
1048  * this data later, it is expected that the userspace client will push the 
1049  * data into the kernel via ioctl calls. 
1051  * The kernel symbol modes are used to control what dtrace does with symbol data: 
1053  * DTRACE_KERNEL_SYMBOLS_NEVER                  Effectively disables fbt/sdt 
1054  * DTRACE_KERNEL_SYMBOLS_FROM_KERNEL            Immediately read/copy symbol data 
1055  * DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE         Wait for symbols from userspace 
1056  * DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL     Immediately read/copy symbol data 
1058  * It is legal to transition between DTRACE_KERNEL_SYMBOLS_FROM_KERNEL and  
1059  * DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE. The DTRACE_KERNEL_SYMBOLS_NEVER and 
1060  * DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL are permanent modes, intended to 
1061  * disable fbt probes entirely, or prevent any symbols being loaded from userspace.
1064  * The kernel symbol mode is kept in dtrace_kernel_symbol_mode, which is protected 
1065  * by the dtrace_lock. 
/* Values of the kernel symbol mode. */
#define	DTRACE_KERNEL_SYMBOLS_NEVER			0
#define	DTRACE_KERNEL_SYMBOLS_FROM_KERNEL		1
#define	DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE		2
#define	DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL	3
1075  * DTrace Helper Implementation 
1077  * A description of the helper architecture may be found in <sys/dtrace.h>. 
1078  * Each process contains a pointer to its helpers in its p_dtrace_helpers 
1079  * member.  This is a pointer to a dtrace_helpers structure, which contains an 
1080  * array of pointers to dtrace_helper structures, helper variable state (shared 
1081  * among a process's helpers) and a generation count.  (The generation count is 
1082  * used to provide an identifier when a helper is added so that it may be 
1083  * subsequently removed.)  The dtrace_helper structure is self-explanatory, 
1084  * containing pointers to the objects needed to execute the helper.  Note that 
1085  * helpers are _duplicated_ across fork(2), and destroyed on exec(2).  No more 
1086  * than dtrace_helpers_max are allowed per-process. 
/* Helper action identifier and the number of helper action kinds. */
#define	DTRACE_HELPER_ACTION_USTACK	0
#define	DTRACE_NHELPER_ACTIONS		1
1091 typedef struct dtrace_helper_action 
{ 
1092         int dtha_generation
;                    /* helper action generation */ 
1093         int dtha_nactions
;                      /* number of actions */ 
1094         dtrace_difo_t 
*dtha_predicate
;          /* helper action predicate */ 
1095         dtrace_difo_t 
**dtha_actions
;           /* array of actions */ 
1096         struct dtrace_helper_action 
*dtha_next
; /* next helper action */ 
1097 } dtrace_helper_action_t
; 
1099 typedef struct dtrace_helper_provider 
{ 
1100         int dthp_generation
;                    /* helper provider generation */ 
1101         uint32_t dthp_ref
;                      /* reference count */ 
1102         dof_helper_t dthp_prov
;                 /* DOF w/ provider and probes */ 
1103 } dtrace_helper_provider_t
; 
1105 typedef struct dtrace_helpers 
{ 
1106         dtrace_helper_action_t 
**dthps_actions
; /* array of helper actions */ 
1107         dtrace_vstate_t dthps_vstate
;           /* helper action var. state */ 
1108         dtrace_helper_provider_t 
**dthps_provs
; /* array of providers */ 
1109         uint_t dthps_nprovs
;                    /* count of providers */ 
1110         uint_t dthps_maxprovs
;                  /* provider array size */ 
1111         int dthps_generation
;                   /* current generation */ 
1112         pid_t dthps_pid
;                        /* pid of associated proc */ 
1113         int dthps_deferred
;                     /* helper in deferred list */ 
1114         struct dtrace_helpers 
*dthps_next
;      /* next pointer */ 
1115         struct dtrace_helpers 
*dthps_prev
;      /* prev pointer */ 
1119  * DTrace Helper Action Tracing 
1121  * Debugging helper actions can be arduous.  To ease the development and 
1122  * debugging of helpers, DTrace contains a tracing-framework-within-a-tracing- 
1123  * framework: helper tracing.  If dtrace_helptrace_enabled is non-zero (which 
1124  * it is by default on DEBUG kernels), all helper activity will be traced to a 
1125  * global, in-kernel ring buffer.  Each entry includes a pointer to the specific 
1126  * helper, the location within the helper, and a trace of all local variables. 
1127  * The ring buffer may be displayed in a human-readable format with the 
1128  * ::dtrace_helptrace mdb(1) dcmd. 
/* Negative marker values used by helper tracing. */
#define	DTRACE_HELPTRACE_NEXT	(-1)
#define	DTRACE_HELPTRACE_DONE	(-2)
#define	DTRACE_HELPTRACE_ERR	(-3)
1135 typedef struct dtrace_helptrace 
{ 
1136         dtrace_helper_action_t  
*dtht_helper
;   /* helper action */ 
1137         int dtht_where
;                         /* where in helper action */ 
1138         int dtht_nlocals
;                       /* number of locals */ 
1139         int dtht_fault
;                         /* type of fault (if any) */ 
1140         int dtht_fltoffs
;                       /* DIF offset */ 
1141         uint64_t dtht_illval
;                   /* faulting value */ 
1142         uint64_t dtht_locals
[1];                /* local variables */ 
1143 } dtrace_helptrace_t
; 
1146  * DTrace Credentials 
1148  * In probe context, we have limited flexibility to examine the credentials 
1149  * of the DTrace consumer that created a particular enabling.  We use 
1150  * the Least Privilege interfaces to cache the consumer's cred pointer and 
1151  * some facts about that credential in a dtrace_cred_t structure. These 
1152  * can limit the consumer's breadth of visibility and what actions the 
1153  * consumer may take. 
/* Credential visibility bits. */
#define	DTRACE_CRV_ALLPROC		0x01
#define	DTRACE_CRV_KERNEL		0x02
#define	DTRACE_CRV_ALLZONE		0x04

/*
 * NOTE(review): the continuation line of this macro was missing from the
 * extract, leaving a dangling backslash that spliced the following #define
 * into it.  The final operand is restored by analogy with DTRACE_CRA_ALL,
 * which ORs every flag in its group -- confirm against the original header.
 */
#define	DTRACE_CRV_ALL		(DTRACE_CRV_ALLPROC | DTRACE_CRV_KERNEL | \
	DTRACE_CRV_ALLZONE)
/* Credential action bits; DTRACE_CRA_ALL is the union of all of them. */
#define	DTRACE_CRA_PROC				0x0001
#define	DTRACE_CRA_PROC_CONTROL			0x0002
#define	DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER	0x0004
#define	DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE	0x0008
#define	DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG	0x0010
#define	DTRACE_CRA_KERNEL			0x0020
#define	DTRACE_CRA_KERNEL_DESTRUCTIVE		0x0040

#define	DTRACE_CRA_ALL		(DTRACE_CRA_PROC | \
	DTRACE_CRA_PROC_CONTROL | \
	DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER | \
	DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE | \
	DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG | \
	DTRACE_CRA_KERNEL | \
	DTRACE_CRA_KERNEL_DESTRUCTIVE)
/*
 * Cached facts about a consumer's credential: destructive, visible and action
 * fields.
 *
 * NOTE(review): listing line 1179 (a member preceding dcr_destructive) and the
 * closing "} ...;" line are absent from this extract.  The prose above says
 * the consumer's cred pointer is cached here, so the missing member is
 * presumably that pointer -- confirm against the original header.
 */
1178 typedef struct dtrace_cred 
{ 
1180         uint8_t                 dcr_destructive
; 
1181         uint8_t                 dcr_visible
; 
1182         uint16_t                dcr_action
; 
1186  * DTrace Consumer State 
1188  * Each DTrace consumer has an associated dtrace_state structure that contains 
1189  * its in-kernel DTrace state -- including options, credentials, statistics and 
1190  * pointers to ECBs, buffers, speculations and formats.  A dtrace_state 
1191  * structure is also allocated for anonymous enablings.  When anonymous state 
1192  * is grabbed, the grabbing consumer's dts_anon pointer is set to the grabbed 
1193  * dtrace_state structure. 
1195 struct dtrace_state 
{ 
1196         dev_t dts_dev
;                          /* device */ 
1197         int dts_necbs
;                          /* total number of ECBs */ 
1198         dtrace_ecb_t 
**dts_ecbs
;                /* array of ECBs */ 
1199         dtrace_epid_t dts_epid
;                 /* next EPID to allocate */ 
1200         size_t dts_needed
;                      /* greatest needed space */ 
1201         struct dtrace_state 
*dts_anon
;          /* anon. state, if grabbed */ 
1202         dtrace_activity_t dts_activity
;         /* current activity */ 
1203         dtrace_vstate_t dts_vstate
;             /* variable state */ 
1204         dtrace_buffer_t 
*dts_buffer
;            /* principal buffer */ 
1205         dtrace_buffer_t 
*dts_aggbuffer
;         /* aggregation buffer */ 
1206         dtrace_speculation_t 
*dts_speculations
; /* speculation array */ 
1207         int dts_nspeculations
;                  /* number of speculations */ 
1208         int dts_naggregations
;                  /* number of aggregations */ 
1209         dtrace_aggregation_t 
**dts_aggregations
; /* aggregation array */ 
1210         vmem_t 
*dts_aggid_arena
;                /* arena for aggregation IDs */ 
1211         uint64_t dts_errors
;                    /* total number of errors */ 
1212         uint32_t dts_speculations_busy
;         /* number of spec. busy */ 
1213         uint32_t dts_speculations_unavail
;      /* number of spec unavail */ 
1214         uint32_t dts_stkstroverflows
;           /* stack string tab overflows */ 
1215         uint32_t dts_dblerrors
;                 /* errors in ERROR probes */ 
1216         uint32_t dts_reserve
;                   /* space reserved for END */ 
1217         hrtime_t dts_laststatus
;                /* time of last status */ 
1218         cyclic_id_t dts_cleaner
;                /* cleaning cyclic */ 
1219         cyclic_id_t dts_deadman
;                /* deadman cyclic */ 
1220         hrtime_t dts_alive
;                     /* time last alive */ 
1221         char dts_speculates
;                    /* boolean: has speculations */ 
1222         char dts_destructive
;                   /* boolean: has dest. actions */ 
1223         int dts_nformats
;                       /* number of formats */ 
1224         char **dts_formats
;                     /* format string array */ 
1225         dtrace_optval_t dts_options
[DTRACEOPT_MAX
]; /* options */ 
1226         dtrace_cred_t dts_cred
;                 /* credentials */ 
1227         size_t dts_nretained
;                   /* number of retained enabs */ 
1228         uint64_t dts_arg_error_illval
; 
1229         uint32_t dts_buf_over_limit
;            /* number of bufs over dtb_limit */ 
1232 struct dtrace_provider 
{ 
1233         dtrace_pattr_t dtpv_attr
;               /* provider attributes */ 
1234         dtrace_ppriv_t dtpv_priv
;               /* provider privileges */ 
1235         dtrace_pops_t dtpv_pops
;                /* provider operations */ 
1236         char *dtpv_name
;                        /* provider name */ 
1237         void *dtpv_arg
;                         /* provider argument */ 
1238         uint_t dtpv_defunct
;                    /* boolean: defunct provider */ 
1239         struct dtrace_provider 
*dtpv_next
;      /* next provider */ 
1240         uint64_t dtpv_probe_count
;              /* number of associated probes */ 
1241         uint64_t dtpv_ecb_count
;                /* number of associated enabled ECBs */ 
1244 struct dtrace_meta 
{ 
1245         dtrace_mops_t dtm_mops
;                 /* meta provider operations */ 
1246         char *dtm_name
;                         /* meta provider name */ 
1247         void *dtm_arg
;                          /* meta provider user arg */ 
1248         uint64_t dtm_count
;                     /* number of associated providers */ 
/*
 * A dtrace_enabling structure is used to track a collection of ECB
 * descriptions -- before they have been turned into actual ECBs.  This is
 * created as a result of DOF processing, and is generally used to generate
 * ECBs immediately thereafter.  However, enablings are also generally
 * retained should the probes they describe be created at a later time; as
 * each new module or provider registers with the framework, the retained
 * enablings are reevaluated, with any new match resulting in new ECBs.  To
 * prevent probes from being matched more than once, the enabling tracks the
 * last probe generation matched, and only matches probes from subsequent
 * generations.
 */
1265 typedef struct dtrace_enabling 
{ 
1266         dtrace_ecbdesc_t 
**dten_desc
;           /* all ECB descriptions */ 
1267         int dten_ndesc
;                         /* number of ECB descriptions */ 
1268         int dten_maxdesc
;                       /* size of ECB array */ 
1269         dtrace_vstate_t 
*dten_vstate
;           /* associated variable state */ 
1270         dtrace_genid_t dten_probegen
;           /* matched probe generation */ 
1271         dtrace_ecbdesc_t 
*dten_current
;         /* current ECB description */ 
1272         int dten_error
;                         /* current error value */ 
1273         int dten_primed
;                        /* boolean: set if primed */ 
1274         struct dtrace_enabling 
*dten_prev
;      /* previous enabling */ 
1275         struct dtrace_enabling 
*dten_next
;      /* next enabling */ 
1276 } dtrace_enabling_t
; 
/*
 * DTrace Anonymous Enablings
 *
 * Anonymous enablings are DTrace enablings that are not associated with a
 * controlling process, but rather derive their enabling from DOF stored as
 * properties in the dtrace.conf file.  If there is an anonymous enabling, a
 * DTrace consumer state and enabling are created on attach.  The state may be
 * subsequently grabbed by the first consumer specifying the "grabanon"
 * option.  As long as an anonymous DTrace enabling exists, dtrace(7D) will
 * refuse to unload.
 */
1289 typedef struct dtrace_anon 
{ 
1290         dtrace_state_t 
*dta_state
;              /* DTrace consumer state */ 
1291         dtrace_enabling_t 
*dta_enabling
;        /* pointer to enabling */ 
1292         processorid_t dta_beganon
;              /* which CPU BEGIN ran on */ 
/*
 * DTrace Error Debugging
 */
/*
 * NOTE(review): the listing skips a line on either side of this #define, so
 * a build-configuration guard may have been lost.  It is kept unconditional
 * here, exactly as shown -- confirm against the build's intent.
 */
#define	DTRACE_ERRDEBUG

#ifdef DTRACE_ERRDEBUG

/*
 * One slot of the error-message hash: a message and how often it was seen.
 *
 * NOTE(review): the closing typedef name was missing from this listing;
 * dtrace_errhash_t follows the file's naming convention -- confirm.
 */
typedef struct dtrace_errhash {
	const char	*dter_msg;	/* error message */
	int		dter_count;	/* number of times seen */
} dtrace_errhash_t;

#define	DTRACE_ERRHASHSZ	256	/* must be > number of err msgs */

#endif	/* DTRACE_ERRDEBUG */
/*
 * DTrace Matching pre-conditions
 *
 * Used when matching new probes to skip the matching of enablings that
 * do not satisfy the condition tested by dmc_func.
 */
1319 typedef struct dtrace_match_cond 
{ 
1320         int (*dmc_func
)(dtrace_probedesc_t
*, void*); 
1322 } dtrace_match_cond_t
; 
/*
 * DTrace Toxic Ranges
 *
 * DTrace supports safe loads from probe context; if the address turns out to
 * be invalid, a bit will be set by the kernel indicating that DTrace
 * encountered a memory error, and DTrace will propagate the error to the user
 * accordingly.  However, there may exist some regions of memory in which an
 * arbitrary load can change system state, and from which it is impossible to
 * recover from such a load after it has been attempted.  Examples of this may
 * include memory in which programmable I/O registers are mapped (for which a
 * read may have some implications for the device) or (in the specific case of
 * UltraSPARC-I and -II) the virtual address hole.  The platform is required
 * to make DTrace aware of these toxic ranges; DTrace will then check that
 * target addresses are not in a toxic range before attempting to issue a
 * safe load.
 */
/*
 * One toxic address range, described by its base and limit addresses.
 */
typedef struct dtrace_toxrange {
	uintptr_t	dtt_base;		/* base of toxic range */
	uintptr_t	dtt_limit;		/* limit of toxic range */
} dtrace_toxrange_t;
1346 extern uint64_t dtrace_getarg(int, int); 
1347 extern int dtrace_getipl(void); 
1348 extern uintptr_t dtrace_caller(int); 
1349 extern uint32_t dtrace_cas32(uint32_t *, uint32_t, uint32_t); 
1350 extern void *dtrace_casptr(void *, void *, void *); 
1351 extern void dtrace_copyin(user_addr_t
, uintptr_t, size_t, volatile uint16_t *); 
1352 extern void dtrace_copyinstr(user_addr_t
, uintptr_t, size_t, volatile uint16_t *); 
1353 extern void dtrace_copyout(uintptr_t, user_addr_t
, size_t, volatile uint16_t *); 
1354 extern void dtrace_copyoutstr(uintptr_t, user_addr_t
, size_t, volatile uint16_t *); 
1355 extern void dtrace_getpcstack(pc_t 
*, int, int, uint32_t *); 
1356 extern uint64_t dtrace_getreg(struct regs 
*, uint_t
); 
1357 extern int dtrace_getstackdepth(int); 
1358 extern void dtrace_getupcstack(uint64_t *, int); 
1359 extern void dtrace_getufpstack(uint64_t *, uint64_t *, int); 
1360 extern int dtrace_getustackdepth(void); 
1361 extern uintptr_t dtrace_fulword(void *); 
1362 extern uint8_t dtrace_fuword8(user_addr_t
); 
1363 extern uint16_t dtrace_fuword16(user_addr_t
); 
1364 extern uint32_t dtrace_fuword32(user_addr_t
); 
1365 extern uint64_t dtrace_fuword64(user_addr_t
); 
1366 extern int dtrace_proc_waitfor(dtrace_procdesc_t
*); 
1367 extern void dtrace_probe_error(dtrace_state_t 
*, dtrace_epid_t
, int, int, 
1369 extern int dtrace_assfail(const char *, const char *, int); 
1370 extern int dtrace_attached(void); 
1371 extern hrtime_t 
dtrace_gethrestime(void); 
1372 extern void dtrace_isa_init(void); 
1374 extern void dtrace_copy(uintptr_t, uintptr_t, size_t); 
1375 extern void dtrace_copystr(uintptr_t, uintptr_t, size_t, volatile uint16_t *); 
/*
 * DTrace state handling
 */
1380 extern minor_t 
dtrace_state_reserve(void); 
1381 extern dtrace_state_t
* dtrace_state_allocate(minor_t minor
); 
1382 extern dtrace_state_t
* dtrace_state_get(minor_t minor
); 
1383 extern void dtrace_state_free(minor_t minor
); 
/*
 * DTrace restriction checks
 */
1388 extern void dtrace_restriction_policy_load(void); 
1389 extern boolean_t 
dtrace_is_restricted(void); 
1390 extern boolean_t 
dtrace_are_restrictions_relaxed(void); 
1391 extern boolean_t 
dtrace_fbt_probes_restricted(void); 
1392 extern boolean_t 
dtrace_sdt_probes_restricted(void); 
1393 extern boolean_t 
dtrace_can_attach_to_proc(proc_t
); 
/*
 * DTrace calls ASSERT and VERIFY from probe context.  To assure that a failed
 * ASSERT or VERIFY does not induce a markedly more catastrophic failure (e.g.,
 * one from which a dump cannot be gleaned), DTrace must define its own ASSERT
 * and VERIFY macros to be ones that may safely be called from probe context.
 * This header file must thus be included by any DTrace component that calls
 * ASSERT and/or VERIFY from probe context, and _only_ by those components.
 * (The only exception to this is kernel debugging infrastructure at user-level
 * that doesn't depend on calling ASSERT.)
 */
/*
 * Discard any ASSERT/VERIFY supplied by earlier headers so that the
 * probe-context-safe versions below replace them instead of colliding.
 */
#undef ASSERT
#undef VERIFY

/*
 * VERIFY is always active: when EX evaluates to false, dtrace_assfail() is
 * called with the expression text, file name and line number.
 */
#define	VERIFY(EX)	((void)((EX) || \
			dtrace_assfail(#EX, __FILE__, __LINE__)))

/*
 * ASSERT has a checking form and a no-op form.  The two definitions cannot
 * coexist unconditionally (that is an invalid macro redefinition), so they
 * are selected by a build conditional.
 *
 * NOTE(review): the conditional was missing from this listing; DEBUG is
 * assumed to be the controlling symbol -- confirm against the build flags.
 */
#if DEBUG
#define	ASSERT(EX)	((void)((EX) || \
			dtrace_assfail(#EX, __FILE__, __LINE__)))
#else
#define	ASSERT(X)	((void)0)
#endif
1424 #endif /* _SYS_DTRACE_IMPL_H */