#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>
#include <math.h>
#include <sys/param.h>
#include <sys/sysctl.h>
#include <mach/mach.h>
#include <pthread.h>
#include <pthread/pthread_spis.h>
#include <os/lock.h>
#include <darwintest.h>
// number of times the lock is taken per dt_stat batch
#define ITERATIONS_PER_DT_STAT_BATCH 10000ull
// number of times the contended mutex is taken per dt_stat batch
#define ITERATIONS_PER_DT_STAT_BATCH_CONTENDED_MUTEX 1000ull
// shift determining power of 2 factor of time spent by worker threads in the
// busy() function while outside of the lock vs inside the lock
#define OUTER_VS_INNER_SHIFT 4
// fraction of read lock vs write lock acquires
#define RDLOCK_FRACTION 0.99f

// move the darwintest assertion code out of the straight line execution path
// since it has non-trivial overhead and codegen impact even if the assertion
// is never triggered.
#define iferr(_e) if(__builtin_expect(!!(_e), 0))
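// Typical use wraps the cold assertion block around an error check, e.g.:
//
//	r = pthread_mutex_lock(&mutex);
//	iferr (r) {T_QUIET; T_ASSERT_POSIX_ZERO(r, "mutex_lock");}
//
// which expands to if (__builtin_expect(!!(r), 0)) { ... }, keeping the
// assertion on a branch the compiler treats as unlikely.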
#pragma mark -

static uint64_t
random_busy_counts(unsigned int *seed, uint64_t *inner, uint64_t *outer)
{
	uint64_t random = rand_r(seed);
	const uint64_t of = (1 << OUTER_VS_INNER_SHIFT);
	*inner = 0x4 + (random & (0x10 - 1));
	*outer = 0x4 * of + ((random >> 4) & (0x10 * of - 1));
	return random;
}
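// With OUTER_VS_INNER_SHIFT == 4 this yields inner counts in [4, 19] and
// outer counts in [64, 319], i.e. on average about 16x (2^4) as much busy
// work outside the lock as inside it.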
// By default busy() does cpu busy work for a passed in number of iterations
enum {
	busy_is_cpu_busy,
	busy_is_cpu_yield,
};
static int busy_select = busy_is_cpu_busy;

static double
cpu_busy(uint64_t n)
{
	double d = M_PI;
	uint64_t i;
	for (i = 0; i < n; i++) d *= M_PI;
	return d;
}
static double
cpu_yield(uint64_t n)
{
	uint64_t i;
	for (i = 0; i < n; i++) {
#if defined(__arm__) || defined(__arm64__)
		asm volatile("yield");
#elif defined(__x86_64__) || defined(__i386__)
		asm volatile("pause");
#else
#error Unrecognized architecture
#endif
	}
	return 0;
}
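// busy() is kept out of line at every call site, presumably so the compiler
// cannot fold or hoist the calibrated busy work within the measured loops.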
__attribute__((noinline))
static double
busy(uint64_t n)
{
	switch (busy_select) {
	case busy_is_cpu_busy:
		return cpu_busy(n);
	case busy_is_cpu_yield:
		return cpu_yield(n);
	default:
		return 0;
	}
}
#pragma mark -

static semaphore_t ready_sem, start_sem, end_sem;
static uint32_t nthreads;
static _Atomic uint32_t active_thr;
static _Atomic int64_t todo;
uint64_t iterations_per_dt_stat_batch = ITERATIONS_PER_DT_STAT_BATCH;

// maintain and print progress counters in between measurement batches
static _Atomic uint64_t total_locks, total_rdlocks, total_wrlocks;
#define ctr_inc(_t) atomic_fetch_add_explicit(&(_t), 1, memory_order_relaxed)
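// Relaxed ordering suffices here: the counters feed only the progress output
// printed between measurement batches and are never used for synchronization.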
static uint32_t
ncpu(void)
{
	static uint32_t activecpu, physicalcpu;
	if (!activecpu) {
		uint32_t n;
		size_t s = sizeof(n);
		sysctlbyname("hw.activecpu", &n, &s, NULL, 0);
		activecpu = n;
		s = sizeof(n);
		sysctlbyname("hw.physicalcpu", &n, &s, NULL, 0);
		physicalcpu = n;
	}
	return MIN(activecpu, physicalcpu);
}
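// Rendezvous protocol: each worker signals ready_sem and blocks on start_sem.
// threaded_bench collects nthreads ready signals, arms the todo counter,
// starts timing, and releases all workers at once; the last worker to go
// idle (active_thr dropping to 0) signals end_sem, so the timed region spans
// only the interval during which every thread is running.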
__attribute__((noinline))
static void
threaded_bench(dt_stat_time_t s, int batch_size)
{
	kern_return_t kr;
	for (int i = 0; i < nthreads; i++) {
		kr = semaphore_wait(ready_sem);
		iferr (kr) {T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");}
	}
	atomic_init(&active_thr, nthreads);
	atomic_init(&todo, batch_size * iterations_per_dt_stat_batch);
	dt_stat_token t = dt_stat_begin(s);
	kr = semaphore_signal_all(start_sem);
	iferr (kr) {T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal_all");}
	kr = semaphore_wait(end_sem);
	iferr (kr) {T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");}
	dt_stat_end_batch(s, batch_size, t);
}
static void
setup_threaded_bench(void* (*thread_fn)(void*), bool singlethreaded)
{
	kern_return_t kr;
	int r;
	char *e;

	if (singlethreaded) {
		nthreads = 1;
	} else {
		if ((e = getenv("DT_STAT_NTHREADS"))) nthreads = strtoul(e, NULL, 0);
		if (nthreads < 2) nthreads = ncpu();
	}
	if ((e = getenv("DT_STAT_CPU_BUSY"))) busy_select = strtoul(e, NULL, 0);

	kr = semaphore_create(mach_task_self(), &ready_sem, SYNC_POLICY_FIFO, 0);
	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");
	kr = semaphore_create(mach_task_self(), &start_sem, SYNC_POLICY_FIFO, 0);
	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");
	kr = semaphore_create(mach_task_self(), &end_sem, SYNC_POLICY_FIFO, 0);
	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");

	pthread_attr_t attr;
	r = pthread_attr_init(&attr);
	T_QUIET; T_ASSERT_POSIX_ZERO(r, "pthread_attr_init");
	r = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
	T_QUIET; T_ASSERT_POSIX_ZERO(r, "pthread_attr_setdetachstate");

	for (int i = 0; i < nthreads; i++) {
		pthread_t th;
		r = pthread_create(&th, &attr, thread_fn, (void *)(uintptr_t)(i + 1));
		T_QUIET; T_ASSERT_POSIX_ZERO(r, "pthread_create");
	}
}
#pragma mark -

static pthread_mutex_t mutex;

static void *
mutex_bench_thread(void * arg)
{
	kern_return_t kr;
	int r;
	unsigned int seed;
	volatile double dummy;

	while (1) {
		seed = (uintptr_t)arg; // each thread repeats its own sequence
		kr = semaphore_wait_signal(start_sem, ready_sem);
		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");

		while (atomic_fetch_sub_explicit(&todo, 1, memory_order_relaxed) > 0) {
			uint64_t inner, outer;
			random_busy_counts(&seed, &inner, &outer);
			dummy = busy(outer);
			r = pthread_mutex_lock(&mutex);
			iferr (r) {T_QUIET; T_ASSERT_POSIX_ZERO(r, "mutex_lock");}
			dummy = busy(inner);
			r = pthread_mutex_unlock(&mutex);
			iferr (r) {T_QUIET; T_ASSERT_POSIX_ZERO(r, "mutex_unlock");}
			ctr_inc(total_locks);
		}

		if (atomic_fetch_sub_explicit(&active_thr, 1, memory_order_relaxed) == 1) {
			kr = semaphore_signal(end_sem);
			T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal");
		}
	}
}
static void
mutex_bench(bool singlethreaded)
{
	int r;
	int batch_size;
	uint64_t batch = 0;

	setup_threaded_bench(mutex_bench_thread, singlethreaded);

	pthread_mutexattr_t attr;
	r = pthread_mutexattr_init(&attr);
	T_QUIET; T_ASSERT_POSIX_ZERO(r, "mutexattr_init");
	r = pthread_mutexattr_setpolicy_np(&attr, _PTHREAD_MUTEX_POLICY_FAIRSHARE);
	T_QUIET; T_ASSERT_POSIX_ZERO(r, "mutexattr_setpolicy_np");
	r = pthread_mutex_init(&mutex, &attr);
	T_QUIET; T_ASSERT_POSIX_ZERO(r, "mutex_init");

	dt_stat_time_t s = dt_stat_time_create("%llu pthread_mutex_lock & "
			"pthread_mutex_unlock (fairshare) on %u thread%s",
			iterations_per_dt_stat_batch, nthreads, nthreads > 1 ? "s" : "");
	do {
		batch_size = dt_stat_batch_size(s);
		threaded_bench(s, batch_size);
		fprintf(stderr, "\rbatch: %4llu\t size: %4d\tmutexes: %8llu",
				++batch, batch_size,
				atomic_load_explicit(&total_locks, memory_order_relaxed));
	} while (!dt_stat_stable(s));
	fprintf(stderr, "\n");
	dt_stat_finalize(s);
}
T_DECL(perf_uncontended_mutex_bench, "Uncontended fairshare mutex",
		T_META_TYPE_PERF, T_META_ALL_VALID_ARCHS(NO),
		T_META_LTEPHASE(LTE_POSTINIT), T_META_CHECK_LEAKS(false))
{
	mutex_bench(true);
}
T_DECL(perf_contended_mutex_bench, "Contended fairshare mutex",
		T_META_TYPE_PERF, T_META_ALL_VALID_ARCHS(NO),
		T_META_LTEPHASE(LTE_POSTINIT), T_META_CHECK_LEAKS(false))
{
	iterations_per_dt_stat_batch = ITERATIONS_PER_DT_STAT_BATCH_CONTENDED_MUTEX;
	mutex_bench(false);
}
#pragma mark -

static pthread_rwlock_t rwlock;

static void *
rwlock_bench_thread(void * arg)
{
	kern_return_t kr;
	int r;
	unsigned int seed;
	volatile double dummy;
	const uint64_t rand_rdlock_max = (double)RAND_MAX * RDLOCK_FRACTION;

	while (1) {
		seed = (uintptr_t)arg; // each thread repeats its own sequence
		kr = semaphore_wait_signal(start_sem, ready_sem);
		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");

		while (atomic_fetch_sub_explicit(&todo, 1, memory_order_relaxed) > 0) {
			uint64_t inner, outer;
			uint64_t random = random_busy_counts(&seed, &inner, &outer);
			dummy = busy(outer);
			if (random < rand_rdlock_max) {
				r = pthread_rwlock_rdlock(&rwlock);
				iferr (r) {T_QUIET; T_ASSERT_POSIX_ZERO(r, "rwlock_rdlock");}
				dummy = busy(inner);
				r = pthread_rwlock_unlock(&rwlock);
				iferr (r) {T_QUIET; T_ASSERT_POSIX_ZERO(r, "rwlock_unlock");}
				ctr_inc(total_rdlocks);
			} else {
				r = pthread_rwlock_wrlock(&rwlock);
				iferr (r) {T_QUIET; T_ASSERT_POSIX_ZERO(r, "rwlock_wrlock");}
				dummy = busy(inner);
				r = pthread_rwlock_unlock(&rwlock);
				iferr (r) {T_QUIET; T_ASSERT_POSIX_ZERO(r, "rwlock_unlock");}
				ctr_inc(total_wrlocks);
			}
			ctr_inc(total_locks);
		}

		if (atomic_fetch_sub_explicit(&active_thr, 1, memory_order_relaxed) == 1) {
			kr = semaphore_signal(end_sem);
			T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal");
		}
	}
}
static void
rwlock_bench(bool singlethreaded)
{
	int r;
	int batch_size;
	uint64_t batch = 0;

	setup_threaded_bench(rwlock_bench_thread, singlethreaded);

	r = pthread_rwlock_init(&rwlock, NULL);
	T_QUIET; T_ASSERT_POSIX_ZERO(r, "rwlock_init");

	dt_stat_time_t s = dt_stat_time_create("%llu pthread_rwlock_rd/wrlock & "
			"pthread_rwlock_unlock (%.0f%% rdlock) on %u thread%s",
			iterations_per_dt_stat_batch, RDLOCK_FRACTION * 100, nthreads,
			nthreads > 1 ? "s" : "");
	do {
		batch_size = dt_stat_batch_size(s);
		threaded_bench(s, batch_size);
		fprintf(stderr, "\rbatch: %4llu\t size: %4d\trwlocks: %8llu\t"
				"rd: %8llu\twr: %8llu", ++batch, batch_size,
				atomic_load_explicit(&total_locks, memory_order_relaxed),
				atomic_load_explicit(&total_rdlocks, memory_order_relaxed),
				atomic_load_explicit(&total_wrlocks, memory_order_relaxed));
	} while (!dt_stat_stable(s));
	fprintf(stderr, "\n");
	dt_stat_finalize(s);
}
T_DECL(perf_uncontended_rwlock_bench, "Uncontended rwlock",
		T_META_TYPE_PERF, T_META_ALL_VALID_ARCHS(NO),
		T_META_LTEPHASE(LTE_POSTINIT), T_META_CHECK_LEAKS(false))
{
	rwlock_bench(true);
}
T_DECL(perf_contended_rwlock_bench, "Contended rwlock",
		T_META_TYPE_PERF, T_META_ALL_VALID_ARCHS(NO),
		T_META_LTEPHASE(LTE_POSTINIT), T_META_CHECK_LEAKS(false))
{
	rwlock_bench(false);
}
#pragma mark -

static os_unfair_lock unfair_lock = OS_UNFAIR_LOCK_INIT;

static void *
unfair_lock_bench_thread(void * arg)
{
	kern_return_t kr;
	unsigned int seed;
	volatile double dummy;

	while (1) {
		seed = (uintptr_t)arg; // each thread repeats its own sequence
		kr = semaphore_wait_signal(start_sem, ready_sem);
		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");

		while (atomic_fetch_sub_explicit(&todo, 1, memory_order_relaxed) > 0) {
			uint64_t inner, outer;
			random_busy_counts(&seed, &inner, &outer);
			dummy = busy(outer);
			os_unfair_lock_lock(&unfair_lock);
			dummy = busy(inner);
			os_unfair_lock_unlock(&unfair_lock);
			ctr_inc(total_locks);
		}

		if (atomic_fetch_sub_explicit(&active_thr, 1, memory_order_relaxed) == 1) {
			kr = semaphore_signal(end_sem);
			T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal");
		}
	}
}
static void
unfair_lock_bench(bool singlethreaded)
{
	int batch_size;
	uint64_t batch = 0;

	setup_threaded_bench(unfair_lock_bench_thread, singlethreaded);

	dt_stat_time_t s = dt_stat_time_create("%llu os_unfair_lock_lock & "
			"os_unfair_lock_unlock on %u thread%s",
			iterations_per_dt_stat_batch, nthreads, nthreads > 1 ? "s" : "");
	do {
		batch_size = dt_stat_batch_size(s);
		threaded_bench(s, batch_size);
		fprintf(stderr, "\rbatch: %4llu\t size: %4d\tunfair_locks: %8llu",
				++batch, batch_size,
				atomic_load_explicit(&total_locks, memory_order_relaxed));
	} while (!dt_stat_stable(s));
	fprintf(stderr, "\n");
	dt_stat_finalize(s);
}
T_DECL(perf_uncontended_unfair_lock_bench, "Uncontended unfair lock",
		T_META_TYPE_PERF, T_META_ALL_VALID_ARCHS(NO),
		T_META_LTEPHASE(LTE_POSTINIT), T_META_CHECK_LEAKS(false))
{
	unfair_lock_bench(true);
}
T_DECL(perf_contended_unfair_lock_bench, "Contended unfair lock",
		T_META_TYPE_PERF, T_META_ALL_VALID_ARCHS(NO),
		T_META_LTEPHASE(LTE_POSTINIT), T_META_CHECK_LEAKS(false))
{
	unfair_lock_bench(false);
}
#pragma mark -

static pthread_mutex_t ffmutex;

static void *
ffmutex_bench_thread(void * arg)
{
	kern_return_t kr;
	int r;
	unsigned int seed;
	volatile double dummy;

	while (1) {
		seed = (uintptr_t)arg; // each thread repeats its own sequence
		kr = semaphore_wait_signal(start_sem, ready_sem);
		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");

		while (atomic_fetch_sub_explicit(&todo, 1, memory_order_relaxed) > 0) {
			uint64_t inner, outer;
			random_busy_counts(&seed, &inner, &outer);
			dummy = busy(outer);
			r = pthread_mutex_lock(&ffmutex);
			iferr (r) {T_QUIET; T_ASSERT_POSIX_ZERO(r, "mutex_lock");}
			dummy = busy(inner);
			r = pthread_mutex_unlock(&ffmutex);
			iferr (r) {T_QUIET; T_ASSERT_POSIX_ZERO(r, "mutex_unlock");}
			ctr_inc(total_locks);
		}

		if (atomic_fetch_sub_explicit(&active_thr, 1, memory_order_relaxed) == 1) {
			kr = semaphore_signal(end_sem);
			T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal");
		}
	}
}
static void
ffmutex_bench(bool singlethreaded)
{
	int r;
	int batch_size;
	uint64_t batch = 0;

	setup_threaded_bench(ffmutex_bench_thread, singlethreaded);

	pthread_mutexattr_t attr;
	r = pthread_mutexattr_init(&attr);
	T_QUIET; T_ASSERT_POSIX_ZERO(r, "mutexattr_init");
	r = pthread_mutexattr_setpolicy_np(&attr, _PTHREAD_MUTEX_POLICY_FIRSTFIT);
	T_QUIET; T_ASSERT_POSIX_ZERO(r, "mutexattr_setpolicy_np");
	r = pthread_mutex_init(&ffmutex, &attr);
	T_QUIET; T_ASSERT_POSIX_ZERO(r, "mutex_init");

	dt_stat_time_t s = dt_stat_time_create("%llu pthread_mutex_lock & "
			"pthread_mutex_unlock (first-fit) on %u thread%s",
			iterations_per_dt_stat_batch, nthreads, nthreads > 1 ? "s" : "");
	do {
		batch_size = dt_stat_batch_size(s);
		threaded_bench(s, batch_size);
		fprintf(stderr, "\rbatch: %4llu\t size: %4d\tffmutexes: %8llu",
				++batch, batch_size,
				atomic_load_explicit(&total_locks, memory_order_relaxed));
	} while (!dt_stat_stable(s));
	fprintf(stderr, "\n");
	dt_stat_finalize(s);
}
T_DECL(perf_uncontended_ffmutex_bench, "Uncontended first-fit mutex",
		T_META_TYPE_PERF, T_META_ALL_VALID_ARCHS(NO),
		T_META_LTEPHASE(LTE_POSTINIT), T_META_CHECK_LEAKS(false))
{
	ffmutex_bench(true);
}
T_DECL(perf_contended_ffmutex_bench, "Contended first-fit mutex",
		T_META_TYPE_PERF, T_META_ALL_VALID_ARCHS(NO),
		T_META_LTEPHASE(LTE_POSTINIT), T_META_CHECK_LEAKS(false))
{
	iterations_per_dt_stat_batch = ITERATIONS_PER_DT_STAT_BATCH_CONTENDED_MUTEX;
	ffmutex_bench(false);
}