#ifdef JEMALLOC_PROF
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define PROF_PREFIX_DEFAULT "jeprof"
#define LG_PROF_BT_MAX_DEFAULT 7
#define LG_PROF_SAMPLE_DEFAULT 0
#define LG_PROF_INTERVAL_DEFAULT -1
#define LG_PROF_TCMAX_DEFAULT -1

/*
 * Hard limit on stack backtrace depth. Note that the version of
 * prof_backtrace() that is based on __builtin_return_address() necessarily has
 * a hard-coded number of backtrace frame handlers.
 */
#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND))
#  define LG_PROF_BT_MAX ((ZU(1) << (LG_SIZEOF_PTR+3)) - 1)
#else
#  define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */
#endif
#define PROF_BT_MAX (1U << LG_PROF_BT_MAX)

/* Initial hash table size. */
#define PROF_CKH_MINITEMS 64

/* Size of memory buffer to use when writing dump files. */
#define PROF_DUMP_BUF_SIZE 65536

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void **vec;
	unsigned len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t *bt;
	unsigned nignore;
	unsigned max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters. An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative. In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t curobjs;
	int64_t curbytes;
	uint64_t accumobjs;
	uint64_t accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t) cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t) lru_link;

	/*
	 * Associated context. If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t *ctx;

	/*
	 * Threads use memory barriers to update the counters. Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must ensure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters. (An illustrative
	 * reader-side sketch follows this struct.)
	 */
	unsigned epoch;

	/* Profiling counters. */
	prof_cnt_t cnts;
};
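
/*
 * Illustrative reader-side sketch (an addition for exposition, not part of the
 * original header): one way a consumer could take a consistent snapshot of
 * cnt->cnts under the even/odd epoch protocol described above. In practice the
 * epoch loads would need to be volatile (or otherwise ordered) so the compiler
 * cannot hoist them out of the loop; the function name is hypothetical.
 */
#if 0
static prof_cnt_t
prof_thr_cnt_snapshot(const prof_thr_cnt_t *cnt)
{
	prof_cnt_t snap;
	unsigned epoch0;

	for (;;) {
		epoch0 = cnt->epoch;
		if (epoch0 & 1U)
			continue; /* Writer is mid-update; retry. */
		snap = cnt->cnts; /* Copy the counters. */
		if (cnt->epoch == epoch0)
			break; /* Epoch unchanged; the snapshot is consistent. */
	}
	return (snap);
}
#endif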

struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t *bt;

	/* Protects cnt_merged and cnts_ql. */
	malloc_mutex_t lock;

	/* Temporary storage for summation during dump. */
	prof_cnt_t cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t) cnts_ql;
};

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects. Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t) lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void **vec;

	/* Sampling state. */
	uint64_t prn_state;
	uint64_t threshold;
	uint64_t accum;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false. No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool opt_prof_active;
extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */
extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */
extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */
extern bool opt_prof_gdump; /* High-water memory dumping. */
extern bool opt_prof_leak; /* Dump leak summary at exit. */
extern bool opt_prof_accum; /* Report cumulative bytes. */
extern ssize_t opt_lg_prof_tcmax; /* lg(max per-thread backtrace cache). */
extern char opt_prof_prefix[PATH_MAX + 1];

/*
 * Profile dump interval, measured in bytes allocated. Each arena triggers a
 * profile dump when it reaches this threshold. The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t prof_interval;
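
/*
 * Worked example (an addition for exposition): assuming narenas == 4 and
 * prof_interval == 2^30, each arena triggers a dump for roughly every 1 GiB it
 * allocates; in the worst case, where allocation is spread evenly across all
 * four arenas, approximately 4 GiB (prof_interval * narenas) of total
 * allocation can elapse between consecutive dumps.
 */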

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool prof_promote;

/* (1U << opt_lg_prof_bt_max). */
extern unsigned prof_bt_max;

/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
#ifndef NO_TLS
extern __thread prof_tdata_t *prof_tdata_tls
    JEMALLOC_ATTR(tls_model("initial-exec"));
#  define PROF_TCACHE_GET() prof_tdata_tls
#  define PROF_TCACHE_SET(v) do { \
	prof_tdata_tls = (v); \
	pthread_setspecific(prof_tdata_tsd, (void *)(v)); \
} while (0)
#else
#  define PROF_TCACHE_GET() \
	((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
#  define PROF_TCACHE_SET(v) do { \
	pthread_setspecific(prof_tdata_tsd, (void *)(v)); \
} while (0)
#endif
/*
 * Same contents as prof_tdata_tls, but initialized such that the TSD
 * destructor is called when a thread exits, so that prof_tdata_tls contents
 * can be merged, unlinked, and deallocated.
 */
extern pthread_key_t prof_tdata_tsd;

void bt_init(prof_bt_t *bt, void **vec);
void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
void prof_idump(void);
bool prof_mdump(const char *filename);
void prof_gdump(void);
prof_tdata_t *prof_tdata_init(void);
void prof_boot0(void);
void prof_boot1(void);
bool prof_boot2(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#ifndef JEMALLOC_ENABLE_INLINE
void prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_thr_cnt_t *prof_alloc_prep(size_t size);
prof_ctx_t *prof_ctx_get(const void *ptr);
void prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool prof_sample_accum_update(size_t size);
void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx);
void prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
	uint64_t r;
	double u;

	/*
	 * Compute sample threshold as a geometrically distributed random
	 * variable with mean (2^opt_lg_prof_sample).
	 *
	 *   prof_tdata->threshold = ceil(log(u) / log(1 - p)),
	 *
	 *   where p = 1 / 2^opt_lg_prof_sample.
	 *
	 * For more information on the math, see:
	 *
	 *   Non-Uniform Random Variate Generation
	 *   Luc Devroye
	 *   Springer-Verlag, New York, 1986
	 *   pp 500
	 *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
	 */
	prn64(r, 53, prof_tdata->prn_state,
	    (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU);
	u = (double)r * (1.0/9007199254740992.0L);
	prof_tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
	    + (uint64_t)1U;
}
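
/*
 * Illustrative sketch (an addition for exposition, not part of the original
 * header): the same inverse-CDF transform written against the standard
 * drand48() generator instead of prn64(). With lg_sample == 19 (a hypothetical
 * value; this header's LG_PROF_SAMPLE_DEFAULT is 0), p == 2^-19, so the
 * returned threshold averages 2^19 bytes (512 KiB) between samples.
 */
#if 0
#include <math.h>
#include <stdlib.h>

static uint64_t
example_sample_threshold(unsigned lg_sample)
{
	double u = 1.0 - drand48(); /* Uniform in (0.0, 1.0]. */
	double p = 1.0 / (double)((uint64_t)1U << lg_sample);

	/* Geometric variate with mean 1/p, as in prof_sample_threshold_update(). */
	return ((uint64_t)(log(u) / log(1.0 - p)) + (uint64_t)1U);
}
#endif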

JEMALLOC_INLINE prof_thr_cnt_t *
prof_alloc_prep(size_t size)
{
#ifdef JEMALLOC_ENABLE_INLINE
	/* This function does not have its own stack frame, because it is inlined. */
#  define NIGNORE 1
#else
#  define NIGNORE 2
#endif
	prof_thr_cnt_t *ret;
	prof_tdata_t *prof_tdata;
	prof_bt_t bt;

	assert(size == s2u(size));

	prof_tdata = PROF_TCACHE_GET();
	if (prof_tdata == NULL) {
		prof_tdata = prof_tdata_init();
		if (prof_tdata == NULL)
			return (NULL);
	}

	if (opt_prof_active == false) {
		/* Sampling is currently inactive, so avoid sampling. */
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;
	} else if (opt_lg_prof_sample == 0) {
		/*
		 * Don't bother with sampling logic, since sampling interval is
		 * 1.
		 */
		bt_init(&bt, prof_tdata->vec);
		prof_backtrace(&bt, NIGNORE, prof_bt_max);
		ret = prof_lookup(&bt);
	} else {
		if (prof_tdata->threshold == 0) {
			/*
			 * Initialize. Seed the prng differently for each
			 * thread.
			 */
			prof_tdata->prn_state = (uint64_t)(uintptr_t)&size;
			prof_sample_threshold_update(prof_tdata);
		}

		/*
		 * Determine whether to capture a backtrace based on whether
		 * size is enough for prof_tdata->accum to reach
		 * prof_tdata->threshold. However, delay updating these
		 * variables until prof_{m,re}alloc(), because we don't know
		 * for sure that the allocation will succeed.
		 *
		 * Use subtraction rather than addition to avoid potential
		 * integer overflow.
		 */
		if (size >= prof_tdata->threshold - prof_tdata->accum) {
			bt_init(&bt, prof_tdata->vec);
			prof_backtrace(&bt, NIGNORE, prof_bt_max);
			ret = prof_lookup(&bt);
		} else
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;
	}

	return (ret);
#undef NIGNORE
}

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		dassert(chunk->arena->magic == ARENA_MAGIC);

		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		dassert(chunk->arena->magic == ARENA_MAGIC);

		arena_prof_ctx_set(ptr, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
	prof_tdata_t *prof_tdata;

	/* Sampling logic is unnecessary if the interval is 1. */
	assert(opt_lg_prof_sample != 0);

	prof_tdata = PROF_TCACHE_GET();
	assert(prof_tdata != NULL);

	/* Take care to avoid integer overflow. */
	if (size >= prof_tdata->threshold - prof_tdata->accum) {
		prof_tdata->accum -= (prof_tdata->threshold - size);
		/* Compute new sample threshold. */
		prof_sample_threshold_update(prof_tdata);
		while (prof_tdata->accum >= prof_tdata->threshold) {
			prof_tdata->accum -= prof_tdata->threshold;
			prof_sample_threshold_update(prof_tdata);
		}
		return (false);
	} else {
		prof_tdata->accum += size;
		return (true);
	}
}

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{

	assert(ptr != NULL);
	assert(size == isalloc(ptr));

	if (opt_lg_prof_sample != 0) {
		if (prof_sample_accum_update(size)) {
			/*
			 * Don't sample. For malloc()-like allocation, it is
			 * always possible to tell in advance how large an
			 * object's usable size will be, so there should never
			 * be a difference between the size passed to
			 * prof_alloc_prep() and prof_malloc().
			 */
			assert((uintptr_t)cnt == (uintptr_t)1U);
		}
	}

	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);

		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
		/*********/
		mb_write();
		/*********/
		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	} else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(size == isalloc(ptr));
		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(size)) {
				/*
				 * Don't sample. The size passed to
				 * prof_alloc_prep() was larger than what
				 * actually got allocated, so a backtrace was
				 * captured for this allocation, even though
				 * its actual size was insufficient to cross
				 * the sample threshold.
				 */
				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(&old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_size;
			malloc_mutex_unlock(&old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);
		cnt->epoch++;
	} else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_size;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		assert(size == isalloc(ptr));
		prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(&ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(&ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
#endif /* JEMALLOC_PROF */