deps/jemalloc.orig/include/jemalloc/internal/prof.h

   1 #ifdef JEMALLOC_PROF
   2 /******************************************************************************/
   3 #ifdef JEMALLOC_H_TYPES
   4
   5 typedef struct prof_bt_s prof_bt_t;             /* Backtrace: PC vector + length. */
   6 typedef struct prof_cnt_s prof_cnt_t;           /* Object/byte counters. */
   7 typedef struct prof_thr_cnt_s prof_thr_cnt_t;   /* One thread's counters for one context. */
   8 typedef struct prof_ctx_s prof_ctx_t;           /* Per-backtrace profiling context. */
   9 typedef struct prof_tdata_s prof_tdata_t;       /* Per-thread profiling state. */
  10
  11 /* Option defaults. */
  12 #define PROF_PREFIX_DEFAULT             "jeprof"    /* NOTE(review): presumably the initial opt_prof_prefix -- confirm. */
  13 #define LG_PROF_BT_MAX_DEFAULT          7   /* lg(frames); must not exceed LG_PROF_BT_MAX. */
  14 #define LG_PROF_SAMPLE_DEFAULT          0   /* lg(bytes); a value of 0 means a sampling interval of 1. */
  15 #define LG_PROF_INTERVAL_DEFAULT        -1  /* NOTE(review): -1 presumably means "disabled" -- confirm. */
  16 #define LG_PROF_TCMAX_DEFAULT           -1  /* NOTE(review): -1 presumably means "unlimited" -- confirm. */
  17
  18 /*
  19  * Hard limit on stack backtrace depth.  Note that the version of
  20  * prof_backtrace() that is based on __builtin_return_address() necessarily has
  21  * a hard-coded number of backtrace frame handlers.
  22  */
  23 #if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND))
  24 #  define LG_PROF_BT_MAX        ((ZU(1) << (LG_SIZEOF_PTR+3)) - 1)
  25 #else
  26 #  define LG_PROF_BT_MAX        7 /* >= LG_PROF_BT_MAX_DEFAULT */
  27 #endif
     /*
      * Hard frame limit derived from the lg limit selected above.
      *
      * NOTE(review): if LG_SIZEOF_PTR is lg(sizeof(void *)), the
      * libgcc/libunwind value of LG_PROF_BT_MAX is (pointer width in bits) - 1,
      * which is not smaller than the width of the unsigned int shifted below;
      * such a shift is undefined in C.  Confirm that PROF_BT_MAX is never
      * evaluated in that configuration.
      */
  28 #define PROF_BT_MAX             (1U << LG_PROF_BT_MAX)
  29
  30 /* Initial hash table size. */
  31 #define PROF_CKH_MINITEMS       64      /* NOTE(review): presumably the starting size of ckh_t tables such as bt2cnt -- confirm. */
  32
  33 /* Size of memory buffer to use when writing dump files. */
  34 #define PROF_DUMP_BUF_SIZE      65536   /* 1 << 16 */
  35
  36 #endif /* JEMALLOC_H_TYPES */
  37 /******************************************************************************/
  38 #ifdef JEMALLOC_H_STRUCTS
  39
  40 struct prof_bt_s {
             /*
              * A stack backtrace.  The frame storage is not embedded: bt_init()
              * is handed the vector to use (PROF_ALLOC_PREP() passes the calling
              * thread's prof_tdata->vec).
              */
  41         /* Backtrace, stored as len program counters. */
  42         void            **vec;
  43         unsigned        len;    /* Number of valid entries in vec. */
  44 };
  45
  46 #ifdef JEMALLOC_PROF_LIBGCC
  47 /* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
  48 typedef struct {
             /*
              * NOTE(review): the fields mirror the parameters of
              * prof_backtrace(bt, nignore, max).  Presumably bt is the backtrace
              * being filled in, nignore the number of innermost frames still to
              * be skipped, and max the frame limit -- confirm against the
              * callbacks that consume this structure.
              */
  49         prof_bt_t       *bt;
  50         unsigned        nignore;
  51         unsigned        max;
  52 } prof_unwind_data_t;
  53 #endif
  54
  55 struct prof_cnt_s {
  56         /*
  57          * Profiling counters.  An allocation/deallocation pair can operate on
  58          * different prof_thr_cnt_t objects that are linked into the same
  59          * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
  60          * negative.  In principle it is possible for the *bytes counters to
  61          * overflow/underflow, but a general solution would require something
  62          * like 128-bit counters; this implementation doesn't bother to solve
  63          * that problem.
  64          */
  65         int64_t         curobjs;        /* Live objects; signed, may go negative. */
  66         int64_t         curbytes;       /* Live bytes; signed, may go negative. */
  67         uint64_t        accumobjs;      /* Cumulative objects; bumped only if opt_prof_accum. */
  68         uint64_t        accumbytes;     /* Cumulative bytes; bumped only if opt_prof_accum. */
  69 };
  70
  71 struct prof_thr_cnt_s {
             /*
              * Counters that one thread keeps for one prof_ctx_t.  The object is
              * a member of two lists at once: the context's cnts_ql and the
              * owning thread's LRU.
              */
  72         /* Linkage into prof_ctx_t's cnts_ql. */
  73         ql_elm(prof_thr_cnt_t)  cnts_link;
  74
  75         /* Linkage into thread's LRU. */
  76         ql_elm(prof_thr_cnt_t)  lru_link;
  77
  78         /*
  79          * Associated context.  If a thread frees an object that it did not
  80          * allocate, it is possible that the context is not cached in the
  81          * thread's hash table, in which case it must be able to look up the
  82          * context, insert a new prof_thr_cnt_t into the thread's hash table,
  83          * and link it into the prof_ctx_t's cnts_ql.
  84          */
  85         prof_ctx_t              *ctx;
  86
  87         /*
  88          * Threads use memory barriers to update the counters.  Since there is
  89          * only ever one writer, the only challenge is for the reader to get a
  90          * consistent read of the counters.
  91          *
  92          * The writer uses this series of operations:
  93          *
  94          * 1) Increment epoch to an odd number.
  95          * 2) Update counters.
  96          * 3) Increment epoch to an even number.
  97          *
  98          * The reader must assure 1) that the epoch is even while it reads the
  99          * counters, and 2) that the epoch doesn't change between the time it
 100          * starts and finishes reading the counters.
              *
              * The writers in this header are prof_malloc(), prof_realloc() and
              * prof_free(); each separates the three steps with mb_write().
 101          */
 102         unsigned                epoch;
 103
 104         /* Profiling counters. */
 105         prof_cnt_t              cnts;
 106 };
 107
 108 struct prof_ctx_s {
             /*
              * Profiling context: the state shared by all threads for one
              * backtrace.
              */
 109         /* Associated backtrace. */
 110         prof_bt_t               *bt;
 111
 112         /* Protects cnt_merged and cnts_ql. */
 113         malloc_mutex_t          lock;
 114
 115         /* Temporary storage for summation during dump. */
 116         prof_cnt_t              cnt_summed;
 117
 118         /* When threads exit, they merge their stats into cnt_merged. */
             /*
              * prof_realloc() and prof_free() also decrement cnt_merged directly,
              * under lock, when prof_lookup() returns NULL.
              */
 119         prof_cnt_t              cnt_merged;
 120
 121         /*
 122          * List of profile counters, one for each thread that has allocated in
 123          * this context.
 124          */
 125         ql_head(prof_thr_cnt_t) cnts_ql;
 126 };
 127
 128 struct prof_tdata_s {
 129         /*
 130          * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
 131          * cache of backtraces, with associated thread-specific prof_thr_cnt_t
 132          * objects.  Other threads may read the prof_thr_cnt_t contents, but no
 133          * others will ever write them.
 134          *
 135          * Upon thread exit, the thread must merge all the prof_thr_cnt_t
 136          * counter data into the associated prof_ctx_t objects, and unlink/free
 137          * the prof_thr_cnt_t objects.
 138          */
 139         ckh_t                   bt2cnt;
 140
 141         /* LRU for contents of bt2cnt. */
 142         ql_head(prof_thr_cnt_t) lru_ql;
 143
 144         /* Backtrace vector, used for calls to prof_backtrace(). */
             /* NOTE(review): presumably sized for prof_bt_max entries -- confirm in prof_tdata_init(). */
 145         void                    **vec;
 146
 147         /* Sampling state. */
 148         uint64_t                prn_state;      /* PRNG state; seeded by PROF_ALLOC_PREP(). */
 149         uint64_t                threshold;      /* Current sample threshold; 0 == not yet initialized. */
 150         uint64_t                accum;          /* Bytes accumulated toward threshold. */
 151 };
 152
 153 #endif /* JEMALLOC_H_STRUCTS */
 154 /******************************************************************************/
 155 #ifdef JEMALLOC_H_EXTERNS
 156
 157 extern bool     opt_prof;             /* Profiling enabled. */
 158 /*
 159  * Even if opt_prof is true, sampling can be temporarily disabled by setting
 160  * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 161  * so there are no guarantees regarding how long it will take for all threads
 162  * to notice state changes.
 163  */
 164 extern bool     opt_prof_active;
 165 extern size_t   opt_lg_prof_bt_max;   /* lg(maximum backtrace depth). */
 166 extern size_t   opt_lg_prof_sample;   /* lg(mean bytes between samples). */
 167 extern ssize_t  opt_lg_prof_interval; /* lg(prof_interval). */
 168 extern bool     opt_prof_gdump;       /* High-water memory dumping. */
 169 extern bool     opt_prof_leak;        /* Dump leak summary at exit. */
 170 extern bool     opt_prof_accum;       /* Report cumulative bytes. */
 171 extern ssize_t  opt_lg_prof_tcmax;    /* lg(max per thread backtrace cache) */
 172 extern char     opt_prof_prefix[PATH_MAX + 1]; /* NOTE(review): presumably the dump filename prefix -- confirm. */
 173
 174 /*
 175  * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 176  * profile dump when it reaches this threshold.  The effect is that the
 177  * interval between profile dumps averages prof_interval, though the actual
 178  * interval between dumps will tend to be sporadic, and the interval will be a
 179  * maximum of approximately (prof_interval * narenas).
 180  */
 181 extern uint64_t prof_interval;
 182
 183 /*
 184  * If true, promote small sampled objects to large objects, since small run
 185  * headers do not have embedded profile context pointers.
 186  */
 187 extern bool     prof_promote;
 188
 189 /* (1U << opt_lg_prof_bt_max). */
     /* PROF_ALLOC_PREP() passes this as the frame limit to prof_backtrace(). */
 190 extern unsigned prof_bt_max;
 191
 192 /* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
 193 #ifndef NO_TLS
     /*
      * TLS build: PROF_TCACHE_GET() is a plain read of prof_tdata_tls.
      * PROF_TCACHE_SET() stores the value both in prof_tdata_tls and under the
      * pthread key prof_tdata_tsd.  Note that v is evaluated twice and that the
      * return value of pthread_setspecific() is ignored.
      */
 194 extern __thread prof_tdata_t    *prof_tdata_tls
 195     JEMALLOC_ATTR(tls_model("initial-exec"));
 196 #  define PROF_TCACHE_GET()     prof_tdata_tls
 197 #  define PROF_TCACHE_SET(v)    do {                                    \
 198         prof_tdata_tls = (v);                                           \
 199         pthread_setspecific(prof_tdata_tsd, (void *)(v));               \
 200 } while (0)
 201 #else
     /* Non-TLS build: both macros go through the pthread key prof_tdata_tsd. */
 202 #  define PROF_TCACHE_GET()                                             \
 203         ((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
 204 #  define PROF_TCACHE_SET(v)    do {                                    \
 205         pthread_setspecific(prof_tdata_tsd, (void *)(v));               \
 206 } while (0)
 207 #endif
 208 /*
 209  * Same contents as prof_tdata_tls, but initialized such that the TSD
 210  * destructor is called when a thread exits, so that prof_tdata_tls contents
 211  * can be merged, unlinked, and deallocated.
 212  */
 213 extern pthread_key_t    prof_tdata_tsd;
 214
     /* Prepare bt to use vec as its frame storage (see PROF_ALLOC_PREP()). */
 215 void    bt_init(prof_bt_t *bt, void **vec);
     /*
      * Capture the current backtrace into bt.  NOTE(review): presumably nignore
      * is the number of innermost frames to omit and max the frame limit --
      * confirm in prof.c.
      */
 216 void    prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
     /*
      * Map a backtrace to the calling thread's counter object.  Callers in this
      * header treat a NULL result as an out-of-memory condition.
      */
 217 prof_thr_cnt_t  *prof_lookup(prof_bt_t *bt);
 218 void    prof_idump(void);
 219 bool    prof_mdump(const char *filename);
 220 void    prof_gdump(void);
     /* Create the calling thread's prof_tdata_t; PROF_ALLOC_PREP() handles a NULL result. */
 221 prof_tdata_t    *prof_tdata_init(void);
 222 void    prof_boot0(void);
 223 void    prof_boot1(void);
 224 bool    prof_boot2(void);
 225
 226 #endif /* JEMALLOC_H_EXTERNS */
 227 /******************************************************************************/
 228 #ifdef JEMALLOC_H_INLINES
 229
 230 #define PROF_ALLOC_PREP(nignore, size, ret) do {                        \
             /* Choose the prof_thr_cnt_t to charge for an allocation of   */\
             /* size bytes and store it in ret:                            */\
             /*   NULL      if prof_tdata_init() fails;                    */\
             /*   1         (sentinel) if the allocation is not sampled;   */\
             /*   otherwise whatever prof_lookup() returns for a fresh     */\
             /*             backtrace (nignore, prof_bt_max).              */\
             /* size is evaluated repeatedly and its address seeds the     */\
             /* PRNG, so it must be a side-effect-free lvalue.             */\
 231         prof_tdata_t *prof_tdata;                                       \
 232         prof_bt_t bt;                                                   \
 233                                                                         \
 234         assert(size == s2u(size));                                      \
 235                                                                         \
 236         prof_tdata = PROF_TCACHE_GET();                                 \
 237         if (prof_tdata == NULL) {                                       \
 238                 prof_tdata = prof_tdata_init();                         \
 239                 if (prof_tdata == NULL) {                               \
 240                         ret = NULL;                                     \
                             /* Leaves the enclosing do { } while (0).     */\
 241                         break;                                          \
 242                 }                                                       \
 243         }                                                               \
 244                                                                         \
 245         if (opt_prof_active == false) {                                 \
 246                 /* Sampling is currently inactive, so avoid sampling. */\
 247                 ret = (prof_thr_cnt_t *)(uintptr_t)1U;                  \
 248         } else if (opt_lg_prof_sample == 0) {                           \
 249                 /* Don't bother with sampling logic, since sampling   */\
 250                 /* interval is 1.                                     */\
 251                 bt_init(&bt, prof_tdata->vec);                          \
 252                 prof_backtrace(&bt, nignore, prof_bt_max);              \
 253                 ret = prof_lookup(&bt);                                 \
 254         } else {                                                        \
 255                 if (prof_tdata->threshold == 0) {                       \
 256                         /* Initialize.  Seed the prng differently for */\
 257                         /* each thread.                               */\
 258                         prof_tdata->prn_state =                         \
 259                             (uint64_t)(uintptr_t)&size;                 \
 260                         prof_sample_threshold_update(prof_tdata);       \
 261                 }                                                       \
 262                                                                         \
 263                 /* Determine whether to capture a backtrace based on  */\
 264                 /* whether size is enough for prof_accum to reach     */\
 265                 /* prof_tdata->threshold.  However, delay updating    */\
 266                 /* these variables until prof_{m,re}alloc(), because  */\
 267                 /* we don't know for sure that the allocation will    */\
 268                 /* succeed.                                           */\
 269                 /*                                                    */\
 270                 /* Use subtraction rather than addition to avoid      */\
 271                 /* potential integer overflow.                        */\
 272                 if (size >= prof_tdata->threshold -                     \
 273                     prof_tdata->accum) {                                \
 274                         bt_init(&bt, prof_tdata->vec);                  \
 275                         prof_backtrace(&bt, nignore, prof_bt_max);      \
 276                         ret = prof_lookup(&bt);                         \
 277                 } else                                                  \
 278                         ret = (prof_thr_cnt_t *)(uintptr_t)1U;          \
 279         }                                                               \
 280 } while (0)
 281
 282 #ifndef JEMALLOC_ENABLE_INLINE
     /*
      * Out-of-line prototypes for the functions defined further down; those
      * definitions are compiled only when JEMALLOC_ENABLE_INLINE or
      * JEMALLOC_PROF_C_ is defined.
      */
 283 void    prof_sample_threshold_update(prof_tdata_t *prof_tdata);
 284 prof_ctx_t      *prof_ctx_get(const void *ptr);
 285 void    prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
 286 bool    prof_sample_accum_update(size_t size);
 287 void    prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
 288 void    prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
 289     size_t old_size, prof_ctx_t *old_ctx);
 290 void    prof_free(const void *ptr, size_t size);
 291 #endif
 292
 293 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
 294 JEMALLOC_INLINE void
 295 prof_sample_threshold_update(prof_tdata_t *prof_tdata)
 296 {
             /*
              * Draw a new sample threshold for prof_tdata: advances
              * prof_tdata->prn_state and overwrites prof_tdata->threshold.
              */
 297         uint64_t r;
 298         double u;
 299
 300         /*
 301          * Compute sample threshold as a geometrically distributed random
 302          * variable with mean (2^opt_lg_prof_sample).
 303          *
 304          *                         __        __
 305          *                         |  log(u)  |                     1
 306          * prof_tdata->threshold = | -------- |, where p = -------------------
 307          *                         | log(1-p) |             opt_lg_prof_sample
 308          *                                                 2
 309          *
 310          * For more information on the math, see:
 311          *
 312          *   Non-Uniform Random Variate Generation
 313          *   Luc Devroye
 314          *   Springer-Verlag, New York, 1986
 315          *   pp 500
 316          *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
 317          */
 318         prn64(r, 53, prof_tdata->prn_state,
 319             (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU);
             /*
              * 9007199254740992 == 2^53.  NOTE(review): presumably prn64() leaves
              * 53 random bits in r, which makes u uniform in [0, 1).  If r can be
              * 0, log(u) is -infinity and the conversion to uint64_t below is
              * undefined -- confirm.
              */
 320         u = (double)r * (1.0/9007199254740992.0L);
             /*
              * The trailing + 1 keeps the threshold nonzero; PROF_ALLOC_PREP()
              * treats a threshold of 0 as "not yet initialized".
              */
 321         prof_tdata->threshold = (uint64_t)(log(u) /
 322             log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
 323             + (uint64_t)1U;
 324 }
 325
 326 JEMALLOC_INLINE prof_ctx_t *
 327 prof_ctx_get(const void *ptr)
 328 {
 329         arena_chunk_t *base;
 330
 331         /* Fetch the profiling context recorded for the allocation ptr. */
 332         assert(ptr != NULL);
 333
 334         base = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 335         if (base == ptr) {
 336                 /* ptr is its own chunk base: use the huge accessor. */
 337                 return (huge_prof_ctx_get(ptr));
 338         }
 339
 340         /* Region. */
 341         dassert(base->arena->magic == ARENA_MAGIC);
 342
 343         return (arena_prof_ctx_get(ptr));
 344 }
 345
 346 JEMALLOC_INLINE void
 347 prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
 348 {
 349         arena_chunk_t *base;
 350
 351         /* Record ctx as the profiling context of the allocation ptr. */
 352         assert(ptr != NULL);
 353         base = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 354         if (base == ptr) {
 355                 /* ptr is its own chunk base: use the huge accessor. */
 356                 huge_prof_ctx_set(ptr, ctx);
 357                 return;
 358         }
 359         dassert(base->arena->magic == ARENA_MAGIC);     /* Region. */
 360         arena_prof_ctx_set(ptr, ctx);
 361 }
 362
 363 JEMALLOC_INLINE bool
 364 prof_sample_accum_update(size_t size)
 365 {
 366         prof_tdata_t *tdata;
 367
 368         /* Charge size against the thread's threshold; false == reached. */
 369         assert(opt_lg_prof_sample != 0); /* Interval 1 needs no sampling. */
 370         tdata = PROF_TCACHE_GET();
 371         assert(tdata != NULL);
 372
 373         /* Subtract rather than add, to avoid integer overflow. */
 374         if (size < tdata->threshold - tdata->accum) {
 375                 tdata->accum += size;
 376                 return (true);
 377         }
 378
 379         /* Carry the excess over, drawing thresholds until one exceeds it. */
 380         tdata->accum -= (tdata->threshold - size);
 381         for (;;) {
 382                 prof_sample_threshold_update(tdata);
 383                 if (tdata->accum < tdata->threshold)
 384                         break;
 385                 tdata->accum -= tdata->threshold;
 386         }
 387         return (false);
 388 }
 389
 390 JEMALLOC_INLINE void
 391 prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
 392 {
             /*
              * Account for a successful malloc()-like allocation of size bytes at
              * ptr.  cnt is the value produced by PROF_ALLOC_PREP(): a counter
              * object, or the sentinel 1 when the allocation is not sampled.  The
              * matching context (or the sentinel) is recorded via prof_ctx_set().
              */
 393
 394         assert(ptr != NULL);
 395         assert(size == isalloc(ptr));
 396
 397         if (opt_lg_prof_sample != 0) {
                     /* Called for its side effect on the thread's accum/threshold. */
 398                 if (prof_sample_accum_update(size)) {
 399                         /*
 400                          * Don't sample.  For malloc()-like allocation, it is
 401                          * always possible to tell in advance how large an
 402                          * object's usable size will be, so there should never
 403                          * be a difference between the size passed to
 404                          * PROF_ALLOC_PREP() and prof_malloc().
 405                          */
 406                         assert((uintptr_t)cnt == (uintptr_t)1U);
 407                 }
 408         }
 409
 410         if ((uintptr_t)cnt > (uintptr_t)1U) {
 411                 prof_ctx_set(ptr, cnt->ctx);
 412
                     /* Writer protocol from prof_thr_cnt_s: epoch becomes odd... */
 413                 cnt->epoch++;
 414                 /*********/
 415                 mb_write();
 416                 /*********/
 417                 cnt->cnts.curobjs++;
 418                 cnt->cnts.curbytes += size;
 419                 if (opt_prof_accum) {
 420                         cnt->cnts.accumobjs++;
 421                         cnt->cnts.accumbytes += size;
 422                 }
 423                 /*********/
 424                 mb_write();
 425                 /*********/
                     /* ...and even again once the counters are updated. */
 426                 cnt->epoch++;
 427                 /*********/
 428                 mb_write();
 429                 /*********/
 430         } else
 431                 prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
 432 }
 433
 434 JEMALLOC_INLINE void
 435 prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
 436     size_t old_size, prof_ctx_t *old_ctx)
 437 {
 438         prof_thr_cnt_t *told_cnt;
 439
             /*
              * Account for a reallocation: remove the old object (old_size bytes,
              * context old_ctx) from the counters and add the new one (ptr, size
              * bytes, counter cnt).
              *
              * ptr may be NULL, but only together with a NULL or sentinel cnt.
              * cnt and old_ctx use the value 1 as a "not sampled" sentinel; only
              * values greater than 1 are dereferenced.
              */
 440         assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);
 441
 442         if (ptr != NULL) {
 443                 assert(size == isalloc(ptr));
 444                 if (opt_lg_prof_sample != 0) {
 445                         if (prof_sample_accum_update(size)) {
 446                                 /*
 447                                  * Don't sample.  The size passed to
 448                                  * PROF_ALLOC_PREP() was larger than what
 449                                  * actually got allocated, so a backtrace was
 450                                  * captured for this allocation, even though
 451                                  * its actual size was insufficient to cross
 452                                  * the sample threshold.
 453                                  */
 454                                 cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
 455                         }
 456                 }
 457         }
 458
 459         if ((uintptr_t)old_ctx > (uintptr_t)1U) {
 460                 told_cnt = prof_lookup(old_ctx->bt);
 461                 if (told_cnt == NULL) {
 462                         /*
 463                          * It's too late to propagate OOM for this realloc(),
 464                          * so operate directly on old_ctx->cnt_merged.
 465                          */
 466                         malloc_mutex_lock(&old_ctx->lock);
 467                         old_ctx->cnt_merged.curobjs--;
 468                         old_ctx->cnt_merged.curbytes -= old_size;
 469                         malloc_mutex_unlock(&old_ctx->lock);
 470                         told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
 471                 }
 472         } else
 473                 told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
 474
             /*
              * Writer protocol from prof_thr_cnt_s, applied to both counter
              * objects at once: make the epochs odd, update, make them even.
              */
 475         if ((uintptr_t)told_cnt > (uintptr_t)1U)
 476                 told_cnt->epoch++;
             /*
              * The assertion at the top permits ptr == NULL with a sentinel cnt.
              * In that case there is no object whose context could be recorded,
              * and prof_ctx_set() asserts ptr != NULL, so the call is skipped.
              */
 477         if ((uintptr_t)cnt > (uintptr_t)1U) {
 478                 prof_ctx_set(ptr, cnt->ctx);
 479                 cnt->epoch++;
 480         } else if (ptr != NULL)
 481                 prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
 482         /*********/
 483         mb_write();
 484         /*********/
 485         if ((uintptr_t)told_cnt > (uintptr_t)1U) {
 486                 told_cnt->cnts.curobjs--;
 487                 told_cnt->cnts.curbytes -= old_size;
 488         }
 489         if ((uintptr_t)cnt > (uintptr_t)1U) {
 490                 cnt->cnts.curobjs++;
 491                 cnt->cnts.curbytes += size;
 492                 if (opt_prof_accum) {
 493                         cnt->cnts.accumobjs++;
 494                         cnt->cnts.accumbytes += size;
 495                 }
 496         }
 497         /*********/
 498         mb_write();
 499         /*********/
 500         if ((uintptr_t)told_cnt > (uintptr_t)1U)
 501                 told_cnt->epoch++;
 502         if ((uintptr_t)cnt > (uintptr_t)1U)
 503                 cnt->epoch++;
 504         /*********/
 505         mb_write(); /* Not strictly necessary. */
 506 }
 507
 508 JEMALLOC_INLINE void
 509 prof_free(const void *ptr, size_t size)
 510 {
             /*
              * Account for the deallocation of the size-byte object at ptr.
              * Nothing is done unless the object carries a real context, i.e. a
              * prof_ctx_get() result greater than the sentinel 1.
              */
 511         prof_ctx_t *ctx = prof_ctx_get(ptr);
 512
 513         if ((uintptr_t)ctx > (uintptr_t)1) {
 514                 assert(size == isalloc(ptr));
                     /* The calling thread's counter object for this context. */
 515                 prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);
 516
 517                 if (tcnt != NULL) {
                             /* Writer protocol from prof_thr_cnt_s: odd, update, even. */
 518                         tcnt->epoch++;
 519                         /*********/
 520                         mb_write();
 521                         /*********/
 522                         tcnt->cnts.curobjs--;
 523                         tcnt->cnts.curbytes -= size;
 524                         /*********/
 525                         mb_write();
 526                         /*********/
 527                         tcnt->epoch++;
 528                         /*********/
 529                         mb_write();
 530                         /*********/
 531                 } else {
 532                         /*
 533                          * OOM during free() cannot be propagated, so operate
 534                          * directly on ctx->cnt_merged.
 535                          */
 536                         malloc_mutex_lock(&ctx->lock);
 537                         ctx->cnt_merged.curobjs--;
 538                         ctx->cnt_merged.curbytes -= size;
 539                         malloc_mutex_unlock(&ctx->lock);
 540                 }
 541         }
 542 }
 543 #endif
 544
 545 #endif /* JEMALLOC_H_INLINES */
 546 /******************************************************************************/
 547 #endif /* JEMALLOC_PROF */