2 /******************************************************************************/
3 #ifdef JEMALLOC_H_TYPES
/*
 * Forward typedefs for the profiler's struct types.  Only the names are
 * introduced here; the struct bodies belong to the JEMALLOC_H_STRUCTS section.
 */
typedef struct prof_bt_s prof_bt_t;		/* Backtrace. */
typedef struct prof_cnt_s prof_cnt_t;		/* Profiling counters. */
typedef struct prof_thr_cnt_s prof_thr_cnt_t;	/* Per thread counters. */
typedef struct prof_ctx_s prof_ctx_t;		/* Per backtrace context. */
typedef struct prof_tdata_s prof_tdata_t;	/* Per thread profiling data. */
/*
 * Option defaults.  The LG_* values are base-2 logarithms.  Negative defaults
 * are parenthesized so that they expand safely inside larger expressions.
 */
#define	PROF_PREFIX_DEFAULT		"jeprof"
#define	LG_PROF_BT_MAX_DEFAULT		7
#define	LG_PROF_SAMPLE_DEFAULT		0
#define	LG_PROF_INTERVAL_DEFAULT	(-1)
#define	LG_PROF_TCMAX_DEFAULT		(-1)
19 * Hard limit on stack backtrace depth. Note that the version of
20 * prof_backtrace() that is based on __builtin_return_address() necessarily has
21 * a hard-coded number of backtrace frame handlers.
23 #if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND))
24 # define LG_PROF_BT_MAX ((ZU(1) << (LG_SIZEOF_PTR+3)) - 1)
26 # define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */
28 #define PROF_BT_MAX (1U << LG_PROF_BT_MAX)
/* Initial hash table size. */
#define	PROF_CKH_MINITEMS		64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUF_SIZE		65536
36 #endif /* JEMALLOC_H_TYPES */
37 /******************************************************************************/
38 #ifdef JEMALLOC_H_STRUCTS
41 /* Backtrace, stored as len program counters. */
46 #ifdef JEMALLOC_PROF_LIBGCC
47 /* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
57 * Profiling counters. An allocation/deallocation pair can operate on
58 * different prof_thr_cnt_t objects that are linked into the same
59 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
60 * negative. In principle it is possible for the *bytes counters to
61 * overflow/underflow, but a general solution would require something
62 * like 128-bit counters; this implementation doesn't bother to solve
71 struct prof_thr_cnt_s
{
72 /* Linkage into prof_ctx_t's cnts_ql. */
73 ql_elm(prof_thr_cnt_t
) cnts_link
;
75 /* Linkage into thread's LRU. */
76 ql_elm(prof_thr_cnt_t
) lru_link
;
79 * Associated context. If a thread frees an object that it did not
80 * allocate, it is possible that the context is not cached in the
81 * thread's hash table, in which case it must be able to look up the
82 * context, insert a new prof_thr_cnt_t into the thread's hash table,
83 * and link it into the prof_ctx_t's cnts_ql.
88 * Threads use memory barriers to update the counters. Since there is
89 * only ever one writer, the only challenge is for the reader to get a
90 * consistent read of the counters.
92 * The writer uses this series of operations:
94 * 1) Increment epoch to an odd number.
96 * 3) Increment epoch to an even number.
98 * The reader must ensure 1) that the epoch is even while it reads the
99 * counters, and 2) that the epoch doesn't change between the time it
100 * starts and finishes reading the counters.
104 /* Profiling counters. */
109 /* Associated backtrace. */
112 /* Protects cnt_merged and cnts_ql. */
115 /* Temporary storage for summation during dump. */
116 prof_cnt_t cnt_summed
;
118 /* When threads exit, they merge their stats into cnt_merged. */
119 prof_cnt_t cnt_merged
;
122 * List of profile counters, one for each thread that has allocated in
125 ql_head(prof_thr_cnt_t
) cnts_ql
;
128 struct prof_tdata_s
{
130 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a
131 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
132 * objects. Other threads may read the prof_thr_cnt_t contents, but no
133 * others will ever write them.
135 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
136 * counter data into the associated prof_ctx_t objects, and unlink/free
137 * the prof_thr_cnt_t objects.
141 /* LRU for contents of bt2cnt. */
142 ql_head(prof_thr_cnt_t
) lru_ql
;
144 /* Backtrace vector, used for calls to prof_backtrace(). */
147 /* Sampling state. */
153 #endif /* JEMALLOC_H_STRUCTS */
154 /******************************************************************************/
155 #ifdef JEMALLOC_H_EXTERNS
/* Run-time profiling options. */
extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_bt_max;	/* lg(maximum backtrace depth). */
extern size_t	opt_lg_prof_sample;	/* lg(mean bytes between samples). */
extern ssize_t	opt_lg_prof_interval;	/* lg(prof_interval). */
extern bool	opt_prof_gdump;		/* High-water memory dumping. */
extern bool	opt_prof_leak;		/* Dump leak summary at exit. */
extern bool	opt_prof_accum;		/* Report cumulative bytes. */
extern ssize_t	opt_lg_prof_tcmax;	/* lg(max per thread backtrace cache). */
172 extern char opt_prof_prefix
[PATH_MAX
+ 1];
/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;

/* (1U << opt_lg_prof_bt_max). */
extern unsigned	prof_bt_max;
192 /* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
194 extern __thread prof_tdata_t
*prof_tdata_tls
195 JEMALLOC_ATTR(tls_model("initial-exec"));
196 # define PROF_TCACHE_GET() prof_tdata_tls
197 # define PROF_TCACHE_SET(v) do { \
198 prof_tdata_tls = (v); \
199 pthread_setspecific(prof_tdata_tsd, (void *)(v)); \
202 # define PROF_TCACHE_GET() \
203 ((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
204 # define PROF_TCACHE_SET(v) do { \
205 pthread_setspecific(prof_tdata_tsd, (void *)(v)); \
/*
 * Same contents as prof_tdata_tls (PROF_TCACHE_SET() stores the pointer in
 * both places), but initialized such that the TSD destructor is called when a
 * thread exits, so that prof_tdata_tls contents can be merged, unlinked, and
 * deallocated.
 */
extern pthread_key_t	prof_tdata_tsd;
215 void bt_init(prof_bt_t
*bt
, void **vec
);
216 void prof_backtrace(prof_bt_t
*bt
, unsigned nignore
, unsigned max
);
217 prof_thr_cnt_t
*prof_lookup(prof_bt_t
*bt
);
218 void prof_idump(void);
219 bool prof_mdump(const char *filename
);
220 void prof_gdump(void);
221 prof_tdata_t
*prof_tdata_init(void);
222 void prof_boot0(void);
223 void prof_boot1(void);
224 bool prof_boot2(void);
226 #endif /* JEMALLOC_H_EXTERNS */
227 /******************************************************************************/
228 #ifdef JEMALLOC_H_INLINES
230 #define PROF_ALLOC_PREP(nignore, size, ret) do { \
231 prof_tdata_t *prof_tdata; \
234 assert(size == s2u(size)); \
236 prof_tdata = PROF_TCACHE_GET(); \
237 if (prof_tdata == NULL) { \
238 prof_tdata = prof_tdata_init(); \
239 if (prof_tdata == NULL) { \
245 if (opt_prof_active == false) { \
246 /* Sampling is currently inactive, so avoid sampling. */\
247 ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
248 } else if (opt_lg_prof_sample == 0) { \
249 /* Don't bother with sampling logic, since sampling */\
250 /* interval is 1. */\
251 bt_init(&bt, prof_tdata->vec); \
252 prof_backtrace(&bt, nignore, prof_bt_max); \
253 ret = prof_lookup(&bt); \
255 if (prof_tdata->threshold == 0) { \
256 /* Initialize. Seed the prng differently for */\
258 prof_tdata->prn_state = \
259 (uint64_t)(uintptr_t)&size; \
260 prof_sample_threshold_update(prof_tdata); \
263 /* Determine whether to capture a backtrace based on */\
264 /* whether size is enough for prof_accum to reach */\
265 /* prof_tdata->threshold. However, delay updating */\
266 /* these variables until prof_{m,re}alloc(), because */\
267 /* we don't know for sure that the allocation will */\
270 /* Use subtraction rather than addition to avoid */\
271 /* potential integer overflow. */\
272 if (size >= prof_tdata->threshold - \
273 prof_tdata->accum) { \
274 bt_init(&bt, prof_tdata->vec); \
275 prof_backtrace(&bt, nignore, prof_bt_max); \
276 ret = prof_lookup(&bt); \
278 ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
282 #ifndef JEMALLOC_ENABLE_INLINE
283 void prof_sample_threshold_update(prof_tdata_t
*prof_tdata
);
284 prof_ctx_t
*prof_ctx_get(const void *ptr
);
285 void prof_ctx_set(const void *ptr
, prof_ctx_t
*ctx
);
286 bool prof_sample_accum_update(size_t size
);
287 void prof_malloc(const void *ptr
, size_t size
, prof_thr_cnt_t
*cnt
);
288 void prof_realloc(const void *ptr
, size_t size
, prof_thr_cnt_t
*cnt
,
289 size_t old_size
, prof_ctx_t
*old_ctx
);
290 void prof_free(const void *ptr
, size_t size
);
293 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
295 prof_sample_threshold_update(prof_tdata_t
*prof_tdata
)
301 * Compute sample threshold as a geometrically distributed random
302 * variable with mean (2^opt_lg_prof_sample).
306 * prof_tdata->threshold = | -------- |, where p = -------------------
307 * | log(1-p) | opt_lg_prof_sample
310 * For more information on the math, see:
312 * Non-Uniform Random Variate Generation
314 * Springer-Verlag, New York, 1986
316 * (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
318 prn64(r
, 53, prof_tdata
->prn_state
,
319 (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU);
320 u
= (double)r
* (1.0/9007199254740992.0L);
321 prof_tdata
->threshold
= (uint64_t)(log(u
) /
322 log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample
))))
326 JEMALLOC_INLINE prof_ctx_t
*
327 prof_ctx_get(const void *ptr
)
330 arena_chunk_t
*chunk
;
334 chunk
= (arena_chunk_t
*)CHUNK_ADDR2BASE(ptr
);
337 dassert(chunk
->arena
->magic
== ARENA_MAGIC
);
339 ret
= arena_prof_ctx_get(ptr
);
341 ret
= huge_prof_ctx_get(ptr
);
347 prof_ctx_set(const void *ptr
, prof_ctx_t
*ctx
)
349 arena_chunk_t
*chunk
;
353 chunk
= (arena_chunk_t
*)CHUNK_ADDR2BASE(ptr
);
356 dassert(chunk
->arena
->magic
== ARENA_MAGIC
);
358 arena_prof_ctx_set(ptr
, ctx
);
360 huge_prof_ctx_set(ptr
, ctx
);
364 prof_sample_accum_update(size_t size
)
366 prof_tdata_t
*prof_tdata
;
368 /* Sampling logic is unnecessary if the interval is 1. */
369 assert(opt_lg_prof_sample
!= 0);
371 prof_tdata
= PROF_TCACHE_GET();
372 assert(prof_tdata
!= NULL
);
374 /* Take care to avoid integer overflow. */
375 if (size
>= prof_tdata
->threshold
- prof_tdata
->accum
) {
376 prof_tdata
->accum
-= (prof_tdata
->threshold
- size
);
377 /* Compute new sample threshold. */
378 prof_sample_threshold_update(prof_tdata
);
379 while (prof_tdata
->accum
>= prof_tdata
->threshold
) {
380 prof_tdata
->accum
-= prof_tdata
->threshold
;
381 prof_sample_threshold_update(prof_tdata
);
385 prof_tdata
->accum
+= size
;
391 prof_malloc(const void *ptr
, size_t size
, prof_thr_cnt_t
*cnt
)
395 assert(size
== isalloc(ptr
));
397 if (opt_lg_prof_sample
!= 0) {
398 if (prof_sample_accum_update(size
)) {
400 * Don't sample. For malloc()-like allocation, it is
401 * always possible to tell in advance how large an
402 * object's usable size will be, so there should never
403 * be a difference between the size passed to
404 * PROF_ALLOC_PREP() and prof_malloc().
406 assert((uintptr_t)cnt
== (uintptr_t)1U);
410 if ((uintptr_t)cnt
> (uintptr_t)1U) {
411 prof_ctx_set(ptr
, cnt
->ctx
);
418 cnt
->cnts
.curbytes
+= size
;
419 if (opt_prof_accum
) {
420 cnt
->cnts
.accumobjs
++;
421 cnt
->cnts
.accumbytes
+= size
;
431 prof_ctx_set(ptr
, (prof_ctx_t
*)(uintptr_t)1U);
435 prof_realloc(const void *ptr
, size_t size
, prof_thr_cnt_t
*cnt
,
436 size_t old_size
, prof_ctx_t
*old_ctx
)
438 prof_thr_cnt_t
*told_cnt
;
440 assert(ptr
!= NULL
|| (uintptr_t)cnt
<= (uintptr_t)1U);
443 assert(size
== isalloc(ptr
));
444 if (opt_lg_prof_sample
!= 0) {
445 if (prof_sample_accum_update(size
)) {
447 * Don't sample. The size passed to
448 * PROF_ALLOC_PREP() was larger than what
449 * actually got allocated, so a backtrace was
450 * captured for this allocation, even though
451 * its actual size was insufficient to cross
452 * the sample threshold.
454 cnt
= (prof_thr_cnt_t
*)(uintptr_t)1U;
459 if ((uintptr_t)old_ctx
> (uintptr_t)1U) {
460 told_cnt
= prof_lookup(old_ctx
->bt
);
461 if (told_cnt
== NULL
) {
463 * It's too late to propagate OOM for this realloc(),
464 * so operate directly on old_ctx->cnt_merged.
466 malloc_mutex_lock(&old_ctx
->lock
);
467 old_ctx
->cnt_merged
.curobjs
--;
468 old_ctx
->cnt_merged
.curbytes
-= old_size
;
469 malloc_mutex_unlock(&old_ctx
->lock
);
470 told_cnt
= (prof_thr_cnt_t
*)(uintptr_t)1U;
473 told_cnt
= (prof_thr_cnt_t
*)(uintptr_t)1U;
475 if ((uintptr_t)told_cnt
> (uintptr_t)1U)
477 if ((uintptr_t)cnt
> (uintptr_t)1U) {
478 prof_ctx_set(ptr
, cnt
->ctx
);
481 prof_ctx_set(ptr
, (prof_ctx_t
*)(uintptr_t)1U);
485 if ((uintptr_t)told_cnt
> (uintptr_t)1U) {
486 told_cnt
->cnts
.curobjs
--;
487 told_cnt
->cnts
.curbytes
-= old_size
;
489 if ((uintptr_t)cnt
> (uintptr_t)1U) {
491 cnt
->cnts
.curbytes
+= size
;
492 if (opt_prof_accum
) {
493 cnt
->cnts
.accumobjs
++;
494 cnt
->cnts
.accumbytes
+= size
;
500 if ((uintptr_t)told_cnt
> (uintptr_t)1U)
502 if ((uintptr_t)cnt
> (uintptr_t)1U)
505 mb_write(); /* Not strictly necessary. */
509 prof_free(const void *ptr
, size_t size
)
511 prof_ctx_t
*ctx
= prof_ctx_get(ptr
);
513 if ((uintptr_t)ctx
> (uintptr_t)1) {
514 assert(size
== isalloc(ptr
));
515 prof_thr_cnt_t
*tcnt
= prof_lookup(ctx
->bt
);
522 tcnt
->cnts
.curobjs
--;
523 tcnt
->cnts
.curbytes
-= size
;
533 * OOM during free() cannot be propagated, so operate
534 * directly on ctx->cnt_merged.
536 malloc_mutex_lock(&ctx
->lock
);
537 ctx
->cnt_merged
.curobjs
--;
538 ctx
->cnt_merged
.curbytes
-= size
;
539 malloc_mutex_unlock(&ctx
->lock
);
545 #endif /* JEMALLOC_H_INLINES */
546 /******************************************************************************/
547 #endif /* JEMALLOC_PROF */