1 #include <libkern/OSAtomic.h>
2 #include <sys/sysctl.h>
4 #include <mach/mach_time.h>
14 #include <sys/proc_info.h>
15 #include <dispatch/dispatch.h>
16 // #include "../src/private.h"
21 #define SMALL_LOOP 1000
23 void report(const char *func
, char *full_name
, double x
, unsigned long loops
, char *unit
) {
24 // XXX: make cols pretty & stuff
25 const char *prefix
= "bench_";
26 const int plen
= strlen(prefix
);
27 assert(!strncmp(func
, prefix
, plen
));
30 asprintf(&name
, "[%s] %s", func
, full_name
);
35 if (!strcmp("mach", unit
)) {
36 static mach_timebase_info_data_t mtb
;
38 (void)mach_timebase_info(&mtb
);
40 x
= (x
* mtb
.numer
) / mtb
.denom
;
44 printf("%-64s %13f%-2s\n", name
, x
, unit
);
48 void bench_queue_mem_use(void) {
49 struct proc_taskinfo pti
;
52 // The 1st call eats a little memory that isn't accounted for
53 // until the 2nd call. Also the _first_ printf eats >1M, so
54 // if you insert some for debugging make sure it isn't the first!
55 proc_pidinfo(getpid(), PROC_PIDTASKINFO
, 0, &pti
, sizeof(pti
));
56 proc_pidinfo(getpid(), PROC_PIDTASKINFO
, 0, &pti
, sizeof(pti
));
57 target_size
= pti
.pti_virtual_size
+ 1024*1024;
60 for(n
= 0; target_size
>= pti
.pti_virtual_size
; n
++) {
61 dispatch_queue_t leak
= dispatch_queue_create("to be deleted", NULL
);
63 proc_pidinfo(getpid(), PROC_PIDTASKINFO
, 0, &pti
, sizeof(pti
));
64 //printf("pti_virtual_size %qd; togo %qd, n %d\n", pti.pti_virtual_size, target_size - pti.pti_virtual_size, n);
67 report(__FUNCTION__
, "#queues to grow VSIZE 1Mbyte", n
-1, 1, "x");
70 void bench_message_round_trip(void) {
71 dispatch_queue_t q1
= dispatch_queue_create("q1", NULL
);
72 dispatch_queue_t q2
= dispatch_queue_create("q2", NULL
);
73 uint64_t start
= mach_absolute_time();
76 for(i
= 0; i
< LOOP
; i
++) {
77 // make sure we don't build up too much of a backlog
78 if (i
&& !(i
& 0x3ff)) {
79 dispatch_sync(q2
, ^{});
81 dispatch_queue_retain(q2
);
84 dispatch_queue_release(q2
);
89 // Make sure eveything has drained before we take the end timestamp
90 dispatch_sync(q1
, ^{});
91 dispatch_sync(q2
, ^{});
93 uint64_t end
= mach_absolute_time();
94 report(__FUNCTION__
, "round trip (async async - implicit copy)", (end
- start
), LOOP
, "mach");
95 dispatch_queue_release(q1
);
96 dispatch_queue_release(q2
);
// Round-trip benchmark with pre-copied (Block_copy'd) blocks, so the
// per-iteration cost of implicit block copying is excluded ("precopy").
// NOTE(review): this file was mangled during extraction — several original
// lines are missing and the surviving fragments are left byte-identical
// below; the exact nesting of b1/b2 cannot be confirmed from here.
99 void bench_precopy_message_round_trip(void) {
100 dispatch_queue_t q1
= dispatch_queue_create("q1", NULL
);
101 dispatch_queue_t q2
= dispatch_queue_create("q2", NULL
);
// b2: second-hop block, asynced to q2 by b1 below; its body is not
// fully visible in the extracted source.
106 dispatch_block_t b2
= Block_copy(^{
// b1: first hop, runs on q1 and forwards the pre-copied b2 to q2.
108 dispatch_block_t b1
= Block_copy(^{
109 unsigned long rc
= dispatch_async(q2
, b2
);
// Presumably balances the per-iteration dispatch_queue_retain(q2) in the
// loop below — TODO confirm which block this release belongs to.
111 dispatch_queue_release(q2
);
// be: pre-copied empty block used for the periodic and final drains.
113 dispatch_block_t be
= Block_copy(^{});
115 uint64_t start
= mach_absolute_time();
118 for(i
= 0; i
< LOOP
; i
++) {
119 // make sure we don't build up too much of a backlog
120 if (i
&& !(i
& 0x3ff)) {
121 dispatch_sync(q2
, be
);
123 dispatch_queue_retain(q2
);
124 rc
= dispatch_async(q1
, b1
);
128 // Make sure everything has drained before we take the end timestamp
129 dispatch_sync(q1
, be
);
130 dispatch_sync(q2
, be
);
132 uint64_t end
= mach_absolute_time();
133 report(__FUNCTION__
, "round trip (a/a - precopy)", (end
- start
), LOOP
, "mach");
134 dispatch_queue_release(q1
);
135 dispatch_queue_release(q2
);
138 void bench_message_round_type_syncasync(void) {
139 dispatch_queue_t q1
= dispatch_queue_create("q1", NULL
);
140 dispatch_queue_t q2
= dispatch_queue_create("q2", NULL
);
141 uint64_t start
= mach_absolute_time();
144 for(i
= 0; i
< LOOP
; i
++) {
145 dispatch_queue_retain(q2
);
147 dispatch_async(q2
, ^{
148 dispatch_queue_release(q2
);
153 // Make sure eveything has drained before we take the end timestamp
154 dispatch_sync(q1
, ^{});
155 dispatch_sync(q2
, ^{});
157 uint64_t end
= mach_absolute_time();
158 report(__FUNCTION__
, "round trip (s/a - implicit copy)", (end
- start
), LOOP
, "mach");
159 dispatch_queue_release(q1
);
160 dispatch_queue_release(q2
);
// Do-nothing dispatch function target, used to flush/drain queues in the
// function-pointer ("_f") benchmarks.
void nothing_f(void *ignored) {
	(void)ignored;
}
166 void brt_f_q1(void *vq2
) {
167 unsigned long rc
= dispatch_async_f((dispatch_queue_t
)vq2
, NULL
, nothing_f
);
171 void bench_message_round_trip_f(void) {
172 dispatch_queue_t q1
= dispatch_queue_create("q1", NULL
);
173 dispatch_queue_t q2
= dispatch_queue_create("q2", NULL
);
174 uint64_t start
= mach_absolute_time();
178 for(i
= 0; i
< LOOP
; i
++) {
179 // make sure we don't build up too much of a backlog
180 if (i
&& !(i
& 0x3ff)) {
181 dispatch_sync_f(q2
, NULL
, nothing_f
);
183 rc
= dispatch_async_f(q1
, q2
, brt_f_q1
);
187 // Make sure eveything has drained before we take the end timestamp
188 dispatch_sync_f(q1
, NULL
, nothing_f
);
189 dispatch_sync_f(q2
, NULL
, nothing_f
);
191 uint64_t end
= mach_absolute_time();
192 report(__FUNCTION__
, "round trip (a/a - no blocks)", (end
- start
), LOOP
, "mach");
193 dispatch_queue_release(q1
);
194 dispatch_queue_release(q2
);
// NOTE(review): the body of this function is missing from the extracted
// source; only its opening line and one comment survive.
197 void bench_message_round_type_syncasync_f(void) {
201 // should extend to keep data on times for latency calc
// struct baton (its header line is missing): the token handed from queue
// to queue in the baton-pass benchmarks below. pass() reads baton_number,
// at_q and passes_left, so pad is presumably sized against those three
// ints to fill one 128-byte line — TODO confirm field order.
206 // Avoid false cache line shares. Big speed difference on a Mac Pro
207 char pad
[128 - sizeof(int)*3];
// kludge: unlocked by the last finishing baton (see pass()); the
// benchmark thread blocks on it in lieu of syncing on complete_q.
210 pthread_mutex_t kludge
;
// Count of batons still in flight; decremented atomically in pass().
211 static int n_baton_kludge
;
213 void pass(dispatch_queue_t
*q
, struct baton
*bat
, const int n_queues
, dispatch_queue_t complete_q
) {
214 //fprintf(stderr, "bat#%d q#%d, passes left: %d\n", bat->baton_number, bat->at_q, bat->baton_number);
215 if (0 == --(bat
->passes_left
)) {
216 dispatch_queue_resume(complete_q
);
218 if (!__sync_sub_and_fetch(&n_baton_kludge
, 1)) {
219 pthread_mutex_unlock(&kludge
);
223 bat
->at_q
= (bat
->at_q
+ 1) % n_queues
;
224 unsigned long rc
= dispatch_async(q
[bat
->at_q
], ^{ pass(q
, bat
, n_queues
, complete_q
); });
// NOTE(review): these lines are the interior of a baton-pass benchmark
// whose opening signature line is missing from the extracted source (its
// report label is "baton pass"); the fragments are left byte-identical.
// One baton per q_div_b queues is started and relayed around the ring by
// pass() until each has made SMALL_LOOP passes.
229 const int n_queues
= 128;
230 const int q_div_b
= 4;
231 const int n_batons
= n_queues
/ q_div_b
;
232 assert(q_div_b
* n_batons
== n_queues
);
233 n_baton_kludge
= n_batons
;
// NOTE(review): stray second ';' after this call in the original.
235 dispatch_queue_t complete_q
= dispatch_queue_create("completion q", NULL
);;
236 char *q_labels
[n_queues
];
// Why a mutex instead of syncing on the suspended completion queue:
240 // creating a queue ("C"), suspending it, blocking in a dispatch_sync, and
241 // having another queue resume C does not appear to ever unblock the
242 // dispatch_sync. XXX: make test case and file radar. (if it still
243 // works that way on recent builds, with dispatch inside libsystem, and
// Lock the kludge mutex so the pthread_mutex_lock below blocks until the
// last baton unlocks it in pass().
247 pthread_mutex_init(&kludge
, NULL
);
248 rc
= pthread_mutex_trylock(&kludge
);
250 q
= alloca(n_queues
* sizeof(dispatch_queue_t
));
// Create the ring of relay queues, each with a unique label.
252 for(i
= 0; i
< n_queues
; i
++) {
253 asprintf(q_labels
+ i
, "relay#%d (%s)", i
, __FUNCTION__
);
255 q
[i
] = dispatch_queue_create(q_labels
[i
], NULL
);
259 uint64_t start_time
= mach_absolute_time();
// Launch one baton on every q_div_b'th queue; complete_q is suspended
// once per baton and resumed by pass() when that baton finishes.
261 for(i
= 0; i
< n_queues
; i
+= q_div_b
) {
262 struct baton
*bat
= valloc(sizeof(struct baton
));
264 bat
->passes_left
= SMALL_LOOP
;
266 bat
->baton_number
= i
/ q_div_b
;
267 dispatch_queue_suspend(complete_q
);
268 rc
= dispatch_async(q
[i
], ^{
269 pass(q
, bat
, n_queues
, complete_q
);
// Blocks until pass() unlocks the kludge mutex for the last baton.
274 // XXX: dispatch_sync(complete_q, ^{});
275 rc
= pthread_mutex_lock(&kludge
);
277 uint64_t end_time
= mach_absolute_time();
278 report(__FUNCTION__
, "baton pass", (end_time
- start_time
), SMALL_LOOP
*n_batons
, "mach");
279 // dispatch_queue_release(q);
// Overload benchmark #2: one baton per queue (q_div_b == 1), all started
// simultaneously by absolute-deadline dispatch timer sources so the whole
// ring fires at once, then measured via pass() as in the baton benchmark.
// NOTE(review): mangled extraction — several original lines (declarations
// of rc/i/end_time, loop/brace closers, teardown) are missing; the
// surviving fragments are left byte-identical. Uses the legacy
// dispatch_source_timer_create()/dispatch_event_t API.
282 void bench_overload2() {
283 const int n_queues
= 128;
284 const int q_div_b
= 1;
285 const int n_batons
= n_queues
/ q_div_b
;
286 n_baton_kludge
= n_batons
;
287 assert(q_div_b
* n_batons
== n_queues
);
288 dispatch_queue_t
*q
= alloca(n_queues
* sizeof(dispatch_queue_t
));
289 dispatch_source_t
*ds
= alloca(n_queues
* sizeof(dispatch_source_t
));
290 dispatch_queue_t complete_q
= dispatch_queue_create("completion q", NULL
);
// First timer block to fire records the shared start timestamp (CAS below).
291 __block
uint64_t start_time
= 0;
292 uint64_t time_to_start
;
294 char *q_labels
[n_queues
];
// Cycle the kludge mutex (left locked by the previous benchmark) so we
// can block on it again below.
298 rc
= pthread_mutex_unlock(&kludge
);
300 rc
= pthread_mutex_trylock(&kludge
);
303 // Start all batons one to two seconds from now.
// Deadline in nanoseconds since the epoch for DISPATCH_TIMER_ABSOLUTE.
304 time_to_start
= (2 + time(NULL
)) * 1000000000;
306 for(i
= 0; i
< n_queues
; i
++) {
307 asprintf(q_labels
+ i
, "queue#%d (%s)", i
, __FUNCTION__
);
309 q
[i
] = dispatch_queue_create(q_labels
[i
], NULL
);
311 struct baton
*bat
= valloc(sizeof(struct baton
));
313 bat
->passes_left
= SMALL_LOOP
;
315 bat
->baton_number
= i
/ q_div_b
;
316 dispatch_queue_suspend(complete_q
);
// One-shot absolute timer on q[i] fires at the common deadline and
// injects this queue's baton into the ring.
317 ds
[i
] = dispatch_source_timer_create(DISPATCH_TIMER_ABSOLUTE
, time_to_start
, 0, NULL
, q
[i
], ^(dispatch_event_t event
){
318 assert(!dispatch_event_get_error(event
, NULL
));
319 // We want to measure the time from the first
320 // baton pass, and NOT include the wait time
321 // for everyone to start to fire
323 uint64_t s
= mach_absolute_time();
// Only the first block to fire wins the CAS and sets start_time.
324 __sync_bool_compare_and_swap(&start_time
, 0, s
);
326 pass(q
, bat
, n_queues
, complete_q
);
// Blocks until the last baton unlocks the kludge mutex in pass().
331 // XXX: dispatch_sync(complete_q, ^{});
332 rc
= pthread_mutex_lock(&kludge
);
335 end_time
= mach_absolute_time();
336 report(__FUNCTION__
, "overload#2", (end_time
- start_time
), SMALL_LOOP
*n_batons
, "mach");
337 // Many releases and free()s
341 void bench_overload1() {
342 const int n_queues
= 128;
343 const int q_div_b
= 1;
344 const int n_batons
= n_queues
/ q_div_b
;
345 n_baton_kludge
= n_batons
;
346 assert(q_div_b
* n_batons
== n_queues
);
347 dispatch_queue_t
*q
= alloca(n_queues
* sizeof(dispatch_queue_t
));
348 dispatch_queue_t complete_q
= dispatch_queue_create("completion q", NULL
);
349 __block
uint64_t start_time
= 0;
350 struct timeval time_to_start
;
352 char *q_labels
[n_queues
];
356 rc
= pthread_mutex_unlock(&kludge
);
358 rc
= pthread_mutex_trylock(&kludge
);
361 // Start all batons one to two seconds from now.
362 gettimeofday(&time_to_start
, NULL
);
363 time_to_start
.tv_sec
+= 2;
365 for(i
= 0; i
< n_queues
; i
++) {
366 asprintf(q_labels
+ i
, "queue#%d (%s)", i
, __FUNCTION__
);
368 q
[i
] = dispatch_queue_create(q_labels
[i
], NULL
);
370 struct baton
*bat
= valloc(sizeof(struct baton
));
372 bat
->passes_left
= SMALL_LOOP
;
374 bat
->baton_number
= i
/ q_div_b
;
375 dispatch_queue_suspend(complete_q
);
376 dispatch_async(q
[i
], ^(void) {
378 gettimeofday(&now
, NULL
);
379 int sec
= time_to_start
.tv_sec
- now
.tv_sec
;
381 int usec
= time_to_start
.tv_usec
+ now
.tv_usec
;
382 if (usec
> 0 || sec
> 0) {
383 usleep(1000000 * sec
+ usec
);
389 // We want to measure the time from the first
390 // baton pass, and NOT include hte wait time
391 // for eveyone to start to fire
393 uint64_t s
= mach_absolute_time();
394 __sync_bool_compare_and_swap(&start_time
, 0, s
);
397 pass(q
, bat
, n_queues
, complete_q
);
401 // XXX: dispatch_sync(complete_q, ^{});
402 rc
= pthread_mutex_lock(&kludge
);
405 end_time
= mach_absolute_time();
406 report(__FUNCTION__
, "overload#1", (end_time
- start_time
), SMALL_LOOP
*n_batons
, "mach");
407 // Many releases and free()s
411 int main(int argc
, char *argv
[]) {
412 // Someday we will be able to take a list of tests to run, or exclude, or something.
414 // There are somewhat different performance characteristics when using the
415 // main queue, so we use a "normal" queue for all our tests.
416 dispatch_queue_t bench_q
= dispatch_queue_create("benhmark Q", NULL
);
418 dispatch_async(bench_q
, ^{
419 // These two aren't as interesting on dual core, they queue all
420 // the calls before making them which isn't really what we
421 // want to test, is it? It also limits the number of loops
422 // we can spin around.
424 bench_message_round_trip();
425 bench_precopy_message_round_trip();
427 bench_message_round_type_syncasync();
428 bench_message_round_trip_f();
429 bench_message_round_type_syncasync_f();
435 // This leaks, so we run it last. Also it gives
436 // wrong results if stdio hasn't been started already,
437 // so we definitely don't want to run it first even if
438 // the leaks are fixed (or ignored)
439 bench_queue_mem_use();