1 #include <Foundation/Foundation.h>
2 #include <libkern/OSAtomic.h>
3 #include <sys/sysctl.h>
5 #include <mach/mach_time.h>
18 #include <dispatch/dispatch.h>
19 #include <dispatch/private.h>
// NOTE(review): this file is a partial, line-numbered excerpt of a larger
// benchmark listing; lines are missing between the visible ones. The stray
// leading integers on each line are the original listing's line numbers.
//
// Benchmark targets defined elsewhere in this translation unit:
// - func:  plain C function timed as "Empty function call" and reused as a
//          do-nothing callback for the dispatch_*_f measurements (presumably
//          an empty body — TODO confirm against the full file).
// - block: global block timed as "Empty block call" (presumably empty).
// - backflip/backflip_done: callbacks for the async "double backflip"
//   enqueue/dequeue measurement near the end of main().
22 __private_extern__ void func(void);
24 __private_extern__ void (^block)(void);
26 static void backflip(void *ctxt);
27 static void backflip_done(void);
// NOTE(review): heavily truncated region. Visible here are fragments of:
// - BasicObject: ObjC class timed by the "Empty ObjC call" and
//   "\"description\" ObjC call" benchmarks (its method declarations and
//   @end markers are missing from this excerpt).
// - a C++ class (name not visible; instantiated as `new BasicClass()` in
//   main) whose virtual method `virtfunc` backs "Empty C++ virtual call".
// - force_a_thread: pthread entry point spawned early in main so that
//   malloc takes its multithreaded path (return type/body not visible).
30 @interface BasicObject : NSObject
36 @implementation BasicObject
44 virtual void virtfunc(void) {
49 force_a_thread(void *arg)
// Shared benchmark state.
// global:  volatile word targeted by the atomic/spin-lock benchmarks.
// cnt:     iteration count for cheap operations (10M).
// cnt2:    iteration count for expensive operations (100K), used by
//          print_result2 and the pthread/semaphore loops.
// loop_cost: measured per-iteration cost of the empty loop; per the
//          "Loop cost subtracted from the following" banner in main, it is
//          presumably captured on the first print_result call and subtracted
//          from later results — TODO confirm (the assignment is not visible
//          in this excerpt).
// cycles_per_nanosecond: derived from sysctl "hw.cpufrequency" in main.
// tbi:     mach_timebase_info numer/denom for converting mach ticks to ns.
56 static volatile int32_t global;
58 static const size_t cnt = 10000000;
59 static const size_t cnt2 = 100000;
62 static long double loop_cost;
63 static long double cycles_per_nanosecond;
64 static mach_timebase_info_data_t tbi;
66 //static void func2(void *, dispatch_item_t di);
// Prints the average per-iteration cost, in CPU cycles, of a benchmark that
// ran `cnt` iterations starting at mach time `s`. Computes elapsed ticks,
// rescales by tbi.numer/denom when the timebase is not 1:1, divides by cnt,
// and multiplies by cycles_per_nanosecond. The loop_cost==0.0 branch
// presumably records the empty-loop baseline on the first call (and later
// calls subtract it) — TODO confirm; the branch bodies, the declaration of
// `dd`, and the closing brace are missing from this excerpt.
// noinline keeps the timing epilogue out of the measured caller.
94 static void __attribute__((noinline))
95 print_result(uint64_t s, const char *str)
97 uint64_t d, e = mach_absolute_time();
102 if (tbi.numer != tbi.denom) {
107 dd = (typeof(dd))d / (typeof(dd))cnt;
110 dd *= cycles_per_nanosecond;
112 printf("%-45s%15.3Lf cycles\n", str, dd);
// Same as print_result but divides by the smaller iteration count `cnt2`,
// for the expensive benchmarks (thread create/join, Mach semaphores,
// "description" calls). No loop_cost bootstrap branch is visible here.
// Interior lines (declaration of `dd`, timebase-rescale body, closing
// brace) are missing from this excerpt.
94 static void __attribute__((noinline))
95 print_result2(uint64_t s, const char *str)
97 uint64_t d, e = mach_absolute_time();
102 if (tbi.numer != tbi.denom) {
107 dd = (typeof(dd))d / (typeof(dd))cnt2;
110 dd *= cycles_per_nanosecond;
112 printf("%-45s%15.3Lf cycles\n", str, dd);
// x86-only helper reading the timestamp counter via `rdtsc` (EDX:EAX) and
// returning it as a 64-bit value; timed by the "rdtsc():" benchmark.
// The function's name line, the declarations of `lo`/`hi`, and the
// closing brace/#endif are missing from this excerpt.
115 #if defined(__i386__) || defined(__x86_64__)
116 static inline uint64_t
121 asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
123 return (uint64_t)hi << 32 | lo;
// Intrusive LIFO free list of fixed 32-byte allocations, used by the
// "per-thread/fixed free(malloc(32))" benchmark to model a caller-owned
// allocation cache. The opening `struct fml {` line is missing from this
// excerpt; fixed_malloc_lifo_head is the list head (pushed/popped without
// locking — single-threaded use in main).
128 struct fml *fml_next;
129 } *fixed_malloc_lifo_head;
131 struct fml *fixed_malloc_lifo(void);// __attribute__((noinline));
132 void fixed_free_lifo(struct fml *fml);// __attribute__((noinline));
// Pops a node from the LIFO free list when one is available; otherwise
// falls through to a fresh malloc(32). The return-type line, the
// null-check around the pop, and the closing brace are missing from
// this excerpt.
135 fixed_malloc_lifo(void)
137 struct fml *fml_r = fixed_malloc_lifo_head;
140 fixed_malloc_lifo_head = fml_r->fml_next;
143 return (struct fml *)malloc(32);
// Pushes `fml` back onto the LIFO free list instead of calling free(),
// so the next fixed_malloc_lifo() reuses it. The return-type line and
// closing brace are missing from this excerpt.
148 fixed_free_lifo(struct fml *fml)
150 fml->fml_next = fixed_malloc_lifo_head;
151 fixed_malloc_lifo_head = fml;
// NOTE(review): interior of main(). The function header/footer and many
// interior lines are missing from this excerpt — declarations of s, i, r,
// kr, sem, pthr, pr, freq, bo, bc, bfs, bf_cnt, sidt_rval, most loop
// bodies, and all closing braces are not visible. Comments below annotate
// only what the visible lines establish.
//
// --- Setup: pool, locks, queues -------------------------------------------
157 NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
158 pthread_mutex_t plock = PTHREAD_MUTEX_INITIALIZER;
159 OSSpinLock slock = OS_SPINLOCK_INIT;
162 pthread_t pthr_pause;
163 dispatch_queue_t q, mq;
// Derive cycles/ns from the advertised CPU frequency.
168 size_t freq_len = sizeof(freq);
173 r = sysctlbyname("hw.cpufrequency", &freq, &freq_len, NULL, 0);
175 assert(freq_len == sizeof(freq));
177 cycles_per_nanosecond = (long double)freq / (long double)NSEC_PER_SEC;
181 /* Malloc has different logic for threaded apps. */
182 r = pthread_create(&pthr_pause, NULL, force_a_thread, NULL);
// Timebase: assumed 1:1 on x86 (simplifies the tick->ns conversion).
185 kr = mach_timebase_info(&tbi);
187 #if defined(__i386__) || defined(__x86_64__)
188 assert(tbi.numer == tbi.denom); /* This will fail on PowerPC. */
// Receivers for the ObjC / C++ dispatch benchmarks.
191 bo = [[BasicObject alloc] init];
194 bc = new BasicClass();
197 q = dispatch_queue_create("com.apple.bench-dispatch", NULL);
200 mq = dispatch_get_main_queue();
203 printf("%-45s%15Lf\n\n", "Cycles per nanosecond:", cycles_per_nanosecond);
// --- Baseline: empty loop (its cost is subtracted from later results) -----
205 s = mach_absolute_time();
206 for (i = cnt; i; i--) {
209 print_result(s, "Empty loop:");
211 printf("\nLoop cost subtracted from the following:\n\n");
// --- Timer primitives -----------------------------------------------------
213 s = mach_absolute_time();
214 for (i = cnt; i; i--) {
215 mach_absolute_time();
217 print_result(s, "mach_absolute_time():");
219 #if defined(__i386__) || defined(__x86_64__)
220 s = mach_absolute_time();
221 for (i = cnt; i; i--) {
224 print_result(s, "rdtsc():");
// --- Threads and Mach semaphores (cnt2 iterations: expensive ops) ---------
227 s = mach_absolute_time();
228 for (i = cnt2; i; i--) {
232 r = pthread_create(&pthr, NULL, (void *(*)(void *))func, NULL);
234 r = pthread_join(pthr, &pr);
237 print_result2(s, "pthread create+join:");
239 s = mach_absolute_time();
240 for (i = cnt2; i; i--) {
241 kr = semaphore_create(mach_task_self(), &sem, SYNC_POLICY_FIFO, 0);
243 kr = semaphore_destroy(mach_task_self(), sem);
246 print_result2(s, "Mach semaphore create/destroy:");
248 kr = semaphore_create(mach_task_self(), &sem, SYNC_POLICY_FIFO, 0);
250 s = mach_absolute_time();
251 for (i = cnt2; i; i--) {
252 kr = semaphore_signal(sem);
255 print_result2(s, "Mach semaphore signal:");
256 kr = semaphore_destroy(mach_task_self(), sem);
// --- Scheduling and malloc ------------------------------------------------
259 s = mach_absolute_time();
260 for (i = cnt; i; i--) {
263 print_result(s, "pthread_yield_np():");
265 s = mach_absolute_time();
266 for (i = cnt; i; i--) {
269 print_result(s, "free(malloc(32)):");
// Two live allocations per iteration defeat malloc's most-recently-used
// small-block cache (frees presumably happen in the missing lines).
271 s = mach_absolute_time();
272 for (i = cnt / 2; i; i--) {
273 void *m1 = malloc(32);
274 void *m2 = malloc(32);
278 print_result(s, "Avoiding the MRU cache of free(malloc(32)):");
280 s = mach_absolute_time();
281 for (i = cnt; i; i--) {
282 fixed_free_lifo(fixed_malloc_lifo());
284 print_result(s, "per-thread/fixed free(malloc(32)):");
286 s = mach_absolute_time();
287 for (i = cnt; i; i--) {
288 assert(strtoull("18446744073709551615", NULL, 0) == ~0ull);
290 print_result(s, "strtoull(\"18446744073709551615\") == ~0ull:");
// --- Call/dispatch overhead: C, block, ObjC, C++ virtual ------------------
292 s = mach_absolute_time();
293 for (i = cnt; i; i--) {
296 print_result(s, "Empty function call:");
299 s = mach_absolute_time();
300 for (i = cnt; i; i--) {
303 print_result(s, "Empty block call:");
306 s = mach_absolute_time();
307 for (i = cnt; i; i--) {
310 print_result(s, "Empty ObjC call:");
312 s = mach_absolute_time();
313 for (i = cnt; i; i--) {
316 print_result(s, "Empty C++ virtual call:");
318 s = mach_absolute_time();
319 for (i = cnt2; i; i--) {
322 print_result2(s, "\"description\" ObjC call:");
// --- x86-only instruction-level benchmarks --------------------------------
328 #if defined(__i386__) || defined(__x86_64__)
329 s = mach_absolute_time();
330 for (i = cnt; i; i--) {
333 print_result(s, "raw 'nop':");
335 s = mach_absolute_time();
336 for (i = cnt; i; i--) {
339 print_result(s, "raw 'pause':");
341 s = mach_absolute_time();
342 for (i = cnt; i; i--) {
345 print_result(s, "Atomic mfence:");
347 s = mach_absolute_time();
348 for (i = cnt; i; i--) {
351 print_result(s, "Atomic lfence:");
353 s = mach_absolute_time();
354 for (i = cnt; i; i--) {
357 print_result(s, "Atomic sfence:");
359 s = mach_absolute_time();
360 for (i = cnt; i; i--) {
362 asm("sidt %0" : "=m" (sidt_rval));
364 print_result(s, "'sidt' instruction:");
// cmpxchg without `lock`: measures the instruction minus bus locking.
366 s = mach_absolute_time();
367 for (i = cnt; i; i--) {
369 asm volatile("cmpxchg %1,%2" : "=a" (prev) : "r" (0l), "m" (global), "0" (1l));
371 print_result(s, "'cmpxchg' without the 'lock' prefix:");
// --- Compiler/library atomics ---------------------------------------------
374 s = mach_absolute_time();
375 for (i = cnt; i; i--) {
376 __sync_lock_test_and_set(&global, 0);
378 print_result(s, "Atomic xchg:");
380 s = mach_absolute_time();
381 for (i = cnt; i; i--) {
382 __sync_val_compare_and_swap(&global, 1, 0);
384 print_result(s, "Atomic cmpxchg:");
386 s = mach_absolute_time();
387 for (i = cnt; i; i--) {
388 __sync_fetch_and_add(&global, 1);
390 print_result(s, "Atomic increment:");
394 s = mach_absolute_time();
395 for (i = cnt; i; i--) {
396 OSAtomicIncrement32Barrier(&global);
398 print_result(s, "OSAtomic increment:");
// --- Locks ----------------------------------------------------------------
// Hand-rolled CAS spin lock (the unlock and the x86 'pause' in the spin
// body are in the missing lines).
402 s = mach_absolute_time();
403 for (i = cnt; i; i--) {
404 while (!__sync_bool_compare_and_swap(&global, 0, 1)) {
406 #if defined(__i386__) || defined(__x86_64__)
413 print_result(s, "Inlined spin lock/unlock:");
415 s = mach_absolute_time();
416 for (i = cnt; i; i--) {
417 OSSpinLockLock(&slock);
418 OSSpinLockUnlock(&slock);
420 print_result(s, "OS spin lock/unlock:");
422 s = mach_absolute_time();
423 for (i = cnt; i; i--) {
424 r = pthread_mutex_lock(&plock);
426 r = pthread_mutex_unlock(&plock);
429 print_result(s, "pthread lock/unlock:");
// --- GCD ------------------------------------------------------------------
432 s = mach_absolute_time();
433 for (i = cnt; i; i--) {
434 dispatch_sync(q, ^{ });
436 print_result(s, "dispatch_sync:");
439 s = mach_absolute_time();
440 for (i = cnt; i; i--) {
441 dispatch_sync_f(q, NULL, (void (*)(void *))func);
443 print_result(s, "dispatch_sync_f:");
446 s = mach_absolute_time();
447 for (i = cnt; i; i--) {
448 dispatch_barrier_sync(q, ^{ });
450 print_result(s, "dispatch_barrier_sync:");
453 s = mach_absolute_time();
454 for (i = cnt; i; i--) {
455 dispatch_barrier_sync_f(q, NULL, (void (*)(void *))func);
457 print_result(s, "dispatch_barrier_sync_f:");
// dispatch_apply_f runs the loop itself, so re-add loop_cost to cancel
// print_result's baseline subtraction.
459 s = mach_absolute_time();
460 dispatch_apply_f(cnt, dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), NULL, (void (*)(void *, size_t))func);
461 s += loop_cost; /* cancel out the implicit subtraction done by the next line */
462 print_result(s, "dispatch_apply_f():");
// Async measurement: two self-re-enqueueing callbacks keep the main queue
// warm; backflip_done reports the result once bf_cnt is exhausted
// (presumably via dispatch_main() in the missing lines — TODO confirm).
464 // we do a "double backflip" to hit the fast-path of the enqueue/dequeue logic
465 bfs = mach_absolute_time();
466 dispatch_async_f(dispatch_get_main_queue(), &bf_cnt, backflip);
467 dispatch_async_f(dispatch_get_main_queue(), &bf_cnt, backflip);
// NOTE(review): fragments of backflip_done and backflip. backflip_done
// prints the async-dispatch result against the `bfs` start time captured in
// main; backflip decrements/inspects the shared counter via `ctxt`
// (the countdown logic is in the missing lines) and re-enqueues itself on
// the main queue, bouncing until the count runs out — the exact termination
// check is not visible here. `return` of a void call is a common tail-call
// idiom for void functions.
472 __attribute__((noinline))
476 print_result(bfs, "dispatch_async_f():");
483 size_t *bf_cnt = (size_t *)ctxt;
485 return dispatch_async_f(dispatch_get_main_queue(), ctxt, backflip);