// apple/libdispatch: testing/bench.mm (from libdispatch-84.5.5)
#include <Foundation/Foundation.h>
#include <libkern/OSAtomic.h>
#include <sys/sysctl.h>
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <math.h>
#ifdef __BLOCKS__
#include <Block.h>
#endif
#include <dispatch/dispatch.h>
#include <dispatch/private.h>

extern "C" {
__private_extern__ void func(void);
#ifdef __BLOCKS__
__private_extern__ void (^block)(void);
#endif
static void backflip(void *ctxt);
static void backflip_done(void);
}

@interface BasicObject : NSObject
{
}
- (void) method;
@end

@implementation BasicObject
- (void) method
{
}
@end

class BasicClass {
public:
	virtual void virtfunc(void) {
	};
};
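
// force_a_thread() just blocks in pause(); it exists only to make the process
// multithreaded, since malloc takes a different (locking) code path once a
// second thread exists (see the pthread_create() call in main()).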
47
48 static void *
49 force_a_thread(void *arg)
50 {
51 pause();
52 abort();
53 return arg;
54 }
55
56 static volatile int32_t global;
57
58 static const size_t cnt = 10000000;
59 static const size_t cnt2 = 100000;
60
61 static uint64_t bfs;
62 static long double loop_cost;
63 static long double cycles_per_nanosecond;
64 static mach_timebase_info_data_t tbi;
65
66 //static void func2(void *, dispatch_item_t di);
67
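// Convert the elapsed mach_absolute_time() delta to nanoseconds via the
// timebase, average it over 'cnt' iterations, subtract the previously
// measured empty-loop overhead, and print the per-iteration cost in CPU
// cycles. The first caller (the empty loop) records loop_cost itself.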
static void __attribute__((noinline))
print_result(uint64_t s, const char *str)
{
	uint64_t d, e = mach_absolute_time();
	long double dd;

	d = e - s;

	if (tbi.numer != tbi.denom) {
		d *= tbi.numer;
		d /= tbi.denom;
	}

	dd = (typeof(dd))d / (typeof(dd))cnt;

	dd -= loop_cost;

	if (loop_cost == 0.0) {
		loop_cost = dd;
	}

	dd *= cycles_per_nanosecond;

	printf("%-45s%15.3Lf cycles\n", str, dd);
}

static void __attribute__((noinline))
print_result2(uint64_t s, const char *str)
{
	uint64_t d, e = mach_absolute_time();
	long double dd;

	d = e - s;

	if (tbi.numer != tbi.denom) {
		d *= tbi.numer;
		d /= tbi.denom;
	}

	dd = (typeof(dd))d / (typeof(dd))cnt2;

	dd -= loop_cost;
	dd *= cycles_per_nanosecond;

	printf("%-45s%15.3Lf cycles\n", str, dd);
}

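// Read the CPU's time-stamp counter directly (x86 only); benchmarked below
// for comparison against mach_absolute_time().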
#if defined(__i386__) || defined(__x86_64__)
static inline uint64_t
rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));

	return (uint64_t)hi << 32 | lo;
}
#endif

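// A trivial single-threaded LIFO free list of 32-byte allocations, used as a
// best-case baseline for the free(malloc(32)) benchmarks in main().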
static struct fml {
	struct fml *fml_next;
} *fixed_malloc_lifo_head;

struct fml *fixed_malloc_lifo(void);// __attribute__((noinline));
void fixed_free_lifo(struct fml *fml);// __attribute__((noinline));

struct fml *
fixed_malloc_lifo(void)
{
	struct fml *fml_r = fixed_malloc_lifo_head;

	if (fml_r) {
		fixed_malloc_lifo_head = fml_r->fml_next;
		return fml_r;
	} else {
		return (struct fml *)malloc(32);
	}
}

void
fixed_free_lifo(struct fml *fml)
{
	fml->fml_next = fixed_malloc_lifo_head;
	fixed_malloc_lifo_head = fml;
}

int
main(void)
{
	NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
	pthread_mutex_t plock = PTHREAD_MUTEX_INITIALIZER;
	OSSpinLock slock = OS_SPINLOCK_INIT;
	BasicObject *bo;
	BasicClass *bc;
	pthread_t pthr_pause;
	dispatch_queue_t q, mq;
	kern_return_t kr;
	semaphore_t sem;
	uint64_t freq;
	uint64_t s;
	size_t freq_len = sizeof(freq);
	size_t bf_cnt = cnt;
	unsigned i;
	int r;

	r = sysctlbyname("hw.cpufrequency", &freq, &freq_len, NULL, 0);
	assert(r != -1);
	assert(freq_len == sizeof(freq));

	cycles_per_nanosecond = (long double)freq / (long double)NSEC_PER_SEC;

	assert(pool);

	/* Malloc has different logic for threaded apps. */
	r = pthread_create(&pthr_pause, NULL, force_a_thread, NULL);
	assert(r == 0);

	kr = mach_timebase_info(&tbi);
	assert(kr == 0);
#if defined(__i386__) || defined(__x86_64__)
	assert(tbi.numer == tbi.denom); /* This will fail on PowerPC. */
#endif

	bo = [[BasicObject alloc] init];
	assert(bo);

	bc = new BasicClass();
	assert(bc);

	q = dispatch_queue_create("com.apple.bench-dispatch", NULL);
	assert(q);

	mq = dispatch_get_main_queue();
	assert(mq);

	printf("%-45s%15Lf\n\n", "Cycles per nanosecond:", cycles_per_nanosecond);

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		asm volatile("");
	}
	print_result(s, "Empty loop:");

	printf("\nLoop cost subtracted from the following:\n\n");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		mach_absolute_time();
	}
	print_result(s, "mach_absolute_time():");

#if defined(__i386__) || defined(__x86_64__)
	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		rdtsc();
	}
	print_result(s, "rdtsc():");
#endif

	s = mach_absolute_time();
	for (i = cnt2; i; i--) {
		pthread_t pthr;
		void *pr;

		r = pthread_create(&pthr, NULL, (void *(*)(void *))func, NULL);
		assert(r == 0);
		r = pthread_join(pthr, &pr);
		assert(r == 0);
	}
	print_result2(s, "pthread create+join:");

	s = mach_absolute_time();
	for (i = cnt2; i; i--) {
		kr = semaphore_create(mach_task_self(), &sem, SYNC_POLICY_FIFO, 0);
		assert(kr == 0);
		kr = semaphore_destroy(mach_task_self(), sem);
		assert(kr == 0);
	}
	print_result2(s, "Mach semaphore create/destroy:");

	kr = semaphore_create(mach_task_self(), &sem, SYNC_POLICY_FIFO, 0);
	assert(kr == 0);
	s = mach_absolute_time();
	for (i = cnt2; i; i--) {
		kr = semaphore_signal(sem);
		assert(kr == 0);
	}
	print_result2(s, "Mach semaphore signal:");
	kr = semaphore_destroy(mach_task_self(), sem);
	assert(kr == 0);

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		pthread_yield_np();
	}
	print_result(s, "pthread_yield_np():");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		free(malloc(32));
	}
	print_result(s, "free(malloc(32)):");

	s = mach_absolute_time();
	for (i = cnt / 2; i; i--) {
		void *m1 = malloc(32);
		void *m2 = malloc(32);
		free(m1);
		free(m2);
	}
	print_result(s, "Avoiding the MRU cache of free(malloc(32)):");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		fixed_free_lifo(fixed_malloc_lifo());
	}
	print_result(s, "per-thread/fixed free(malloc(32)):");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		assert(strtoull("18446744073709551615", NULL, 0) == ~0ull);
	}
	print_result(s, "strtoull(\"18446744073709551615\") == ~0ull:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		func();
	}
	print_result(s, "Empty function call:");

#ifdef __BLOCKS__
	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		block();
	}
	print_result(s, "Empty block call:");
#endif

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		[bo method];
	}
	print_result(s, "Empty ObjC call:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		bc->virtfunc();
	}
	print_result(s, "Empty C++ virtual call:");

	s = mach_absolute_time();
	for (i = cnt2; i; i--) {
		[bo description];
	}
	print_result2(s, "\"description\" ObjC call:");

	[pool release];

	pool = NULL;

#if defined(__i386__) || defined(__x86_64__)
	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		asm("nop");
	}
	print_result(s, "raw 'nop':");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		asm("pause");
	}
	print_result(s, "raw 'pause':");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		asm("mfence");
	}
	print_result(s, "Atomic mfence:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		asm("lfence");
	}
	print_result(s, "Atomic lfence:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		asm("sfence");
	}
	print_result(s, "Atomic sfence:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		uint64_t sidt_rval;
		asm("sidt %0" : "=m" (sidt_rval));
	}
	print_result(s, "'sidt' instruction:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		int prev;
		asm volatile("cmpxchg %1,%2" : "=a" (prev) : "r" (0l), "m" (global), "0" (1l));
	}
	print_result(s, "'cmpxchg' without the 'lock' prefix:");
#endif

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		__sync_lock_test_and_set(&global, 0);
	}
	print_result(s, "Atomic xchg:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		__sync_val_compare_and_swap(&global, 1, 0);
	}
	print_result(s, "Atomic cmpxchg:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		__sync_fetch_and_add(&global, 1);
	}
	print_result(s, "Atomic increment:");

	global = 0;

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		OSAtomicIncrement32Barrier(&global);
	}
	print_result(s, "OSAtomic increment:");

	global = 0;

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		while (!__sync_bool_compare_and_swap(&global, 0, 1)) {
			do {
#if defined(__i386__) || defined(__x86_64__)
				asm("pause");
#endif
			} while (global);
		}
		global = 0;
	}
	print_result(s, "Inlined spin lock/unlock:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		OSSpinLockLock(&slock);
		OSSpinLockUnlock(&slock);
	}
	print_result(s, "OS spin lock/unlock:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		r = pthread_mutex_lock(&plock);
		assert(r == 0);
		r = pthread_mutex_unlock(&plock);
		assert(r == 0);
	}
	print_result(s, "pthread lock/unlock:");

#ifdef __BLOCKS__
	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		dispatch_sync(q, ^{ });
	}
	print_result(s, "dispatch_sync:");
#endif

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		dispatch_sync_f(q, NULL, (void (*)(void *))func);
	}
	print_result(s, "dispatch_sync_f:");

#ifdef __BLOCKS__
	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		dispatch_barrier_sync(q, ^{ });
	}
	print_result(s, "dispatch_barrier_sync:");
#endif

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		dispatch_barrier_sync_f(q, NULL, (void (*)(void *))func);
	}
	print_result(s, "dispatch_barrier_sync_f:");

	s = mach_absolute_time();
	dispatch_apply_f(cnt, dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), NULL, (void (*)(void *, size_t))func);
	s += loop_cost; /* cancel out the implicit subtraction done by the next line */
	print_result(s, "dispatch_apply_f():");

	// we do a "double backflip" to hit the fast-path of the enqueue/dequeue logic
	bfs = mach_absolute_time();
	dispatch_async_f(dispatch_get_main_queue(), &bf_cnt, backflip);
	dispatch_async_f(dispatch_get_main_queue(), &bf_cnt, backflip);

	dispatch_main();
}

__attribute__((noinline))
void
backflip_done(void)
{
	print_result(bfs, "dispatch_async_f():");
	exit(EXIT_SUCCESS);
}

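// backflip() re-enqueues itself onto the main queue until bf_cnt reaches
// zero, so the figure reported by backflip_done() is the average cost of one
// dispatch_async_f() enqueue/dequeue round trip on the main queue.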
void
backflip(void *ctxt)
{
	size_t *bf_cnt = (size_t *)ctxt;
	if (--(*bf_cnt)) {
		return dispatch_async_f(dispatch_get_main_queue(), ctxt, backflip);
	}
	backflip_done();
}