1 #include <libkern/OSAtomic.h>
2 #include <sys/sysctl.h>
4 #include <mach/mach_time.h>
14 #include <sys/proc_info.h>
15 #include <dispatch/dispatch.h>
16 // #include "../src/private.h"
21 #define SMALL_LOOP 1000
23 void report(const char *func
, char *full_name
, double x
, unsigned long loops
, char *unit
) {
24 // XXX: make cols pretty & stuff
25 const char *prefix
= "bench_";
26 const int plen
= strlen(prefix
);
27 assert(!strncmp(func
, prefix
, plen
));
30 asprintf(&name
, "[%s] %s", func
, full_name
);
35 if (!strcmp("mach", unit
)) {
36 static mach_timebase_info_data_t mtb
;
38 (void)mach_timebase_info(&mtb
);
40 x
= (x
* mtb
.numer
) / mtb
.denom
;
44 printf("%-64s %13f%-2s\n", name
, x
, unit
);
48 void bench_queue_mem_use(void) {
49 struct proc_taskinfo pti
;
52 // The 1st call eats a little memory that isn't accounted for
53 // until the 2nd call. Also the _first_ printf eats >1M, so
54 // if you insert some for debugging make sure it isn't the first!
55 proc_pidinfo(getpid(), PROC_PIDTASKINFO
, 0, &pti
, sizeof(pti
));
56 proc_pidinfo(getpid(), PROC_PIDTASKINFO
, 0, &pti
, sizeof(pti
));
57 target_size
= pti
.pti_virtual_size
+ 1024*1024;
60 for(n
= 0; target_size
>= pti
.pti_virtual_size
; n
++) {
61 dispatch_queue_t leak
= dispatch_queue_create("to be deleted", NULL
);
63 proc_pidinfo(getpid(), PROC_PIDTASKINFO
, 0, &pti
, sizeof(pti
));
64 //printf("pti_virtual_size %qd; togo %qd, n %d\n", pti.pti_virtual_size, target_size - pti.pti_virtual_size, n);
67 report(__FUNCTION__
, "#queues to grow VSIZE 1Mbyte", n
-1, 1, "x");
70 void bench_message_round_trip(void) {
71 dispatch_queue_t q1
= dispatch_queue_create("q1", NULL
);
72 dispatch_queue_t q2
= dispatch_queue_create("q2", NULL
);
73 uint64_t start
= mach_absolute_time();
76 for(i
= 0; i
< LOOP
; i
++) {
77 // make sure we don't build up too much of a backlog
78 if (i
&& !(i
& 0x3ff)) {
79 dispatch_sync(q2
, ^{});
81 dispatch_queue_retain(q2
);
84 dispatch_queue_release(q2
);
89 // Make sure eveything has drained before we take the end timestamp
90 dispatch_sync(q1
, ^{});
91 dispatch_sync(q2
, ^{});
93 uint64_t end
= mach_absolute_time();
94 report(__FUNCTION__
, "round trip (async async - implicit copy)", (end
- start
), LOOP
, "mach");
95 dispatch_queue_release(q1
);
96 dispatch_queue_release(q2
);
// Round-trip benchmark with pre-copied (Block_copy'd) blocks, so the
// per-iteration cost of implicit block copying is excluded ("precopy").
// NOTE(review): this file was mangled during extraction — several original
// lines are missing and the surviving fragments are left byte-identical
// below; the exact nesting of b1/b2 cannot be confirmed from here.
99 void bench_precopy_message_round_trip(void) {
100 dispatch_queue_t q1
= dispatch_queue_create("q1", NULL
);
101 dispatch_queue_t q2
= dispatch_queue_create("q2", NULL
);
// b2: second-hop block, asynced to q2 by b1 below; its body is not
// fully visible in the extracted source.
106 dispatch_block_t b2
= Block_copy(^{
// b1: first hop, runs on q1 and forwards the pre-copied b2 to q2.
108 dispatch_block_t b1
= Block_copy(^{
109 unsigned long rc
= dispatch_async(q2
, b2
);
// Presumably balances the per-iteration dispatch_queue_retain(q2) in the
// loop below — TODO confirm which block this release belongs to.
111 dispatch_queue_release(q2
);
// be: pre-copied empty block used for the periodic and final drains.
113 dispatch_block_t be
= Block_copy(^{});
115 uint64_t start
= mach_absolute_time();
118 for(i
= 0; i
< LOOP
; i
++) {
119 // make sure we don't build up too much of a backlog
120 if (i
&& !(i
& 0x3ff)) {
121 dispatch_sync(q2
, be
);
123 dispatch_queue_retain(q2
);
124 rc
= dispatch_async(q1
, b1
);
128 // Make sure everything has drained before we take the end timestamp
129 dispatch_sync(q1
, be
);
130 dispatch_sync(q2
, be
);
132 uint64_t end
= mach_absolute_time();
133 report(__FUNCTION__
, "round trip (a/a - precopy)", (end
- start
), LOOP
, "mach");
134 dispatch_queue_release(q1
);
135 dispatch_queue_release(q2
);
138 void bench_message_round_type_syncasync(void) {
139 dispatch_queue_t q1
= dispatch_queue_create("q1", NULL
);
140 dispatch_queue_t q2
= dispatch_queue_create("q2", NULL
);
141 uint64_t start
= mach_absolute_time();
144 for(i
= 0; i
< LOOP
; i
++) {
145 dispatch_queue_retain(q2
);
147 dispatch_async(q2
, ^{
148 dispatch_queue_release(q2
);
153 // Make sure eveything has drained before we take the end timestamp
154 dispatch_sync(q1
, ^{});
155 dispatch_sync(q2
, ^{});
157 uint64_t end
= mach_absolute_time();
158 report(__FUNCTION__
, "round trip (s/a - implicit copy)", (end
- start
), LOOP
, "mach");
159 dispatch_queue_release(q1
);
160 dispatch_queue_release(q2
);
// Do-nothing dispatch function target, used to flush/drain queues in the
// function-pointer ("_f") benchmarks.
void nothing_f(void *ignored) {
	(void)ignored;
}
166 void brt_f_q1(void *vq2
) {
167 unsigned long rc
= dispatch_async_f((dispatch_queue_t
)vq2
, NULL
, nothing_f
);
171 void bench_message_round_trip_f(void) {
172 dispatch_queue_t q1
= dispatch_queue_create("q1", NULL
);
173 dispatch_queue_t q2
= dispatch_queue_create("q2", NULL
);
174 uint64_t start
= mach_absolute_time();
178 for(i
= 0; i
< LOOP
; i
++) {
179 // make sure we don't build up too much of a backlog
180 if (i
&& !(i
& 0x3ff)) {
181 dispatch_sync_f(q2
, NULL
, nothing_f
);
183 rc
= dispatch_async_f(q1
, q2
, brt_f_q1
);
187 // Make sure eveything has drained before we take the end timestamp
188 dispatch_sync_f(q1
, NULL
, nothing_f
);
189 dispatch_sync_f(q2
, NULL
, nothing_f
);
191 uint64_t end
= mach_absolute_time();
192 report(__FUNCTION__
, "round trip (a/a - no blocks)", (end
- start
), LOOP
, "mach");
193 dispatch_queue_release(q1
);
194 dispatch_queue_release(q2
);
// NOTE(review): the body of this function is missing from the extracted
// source; only its opening line and one comment survive.
197 void bench_message_round_type_syncasync_f(void) {
201 // should extend to keep data on times for latency calc
// struct baton (its header line is missing): the token handed from queue
// to queue in the baton-pass benchmarks below. pass() reads baton_number,
// at_q and passes_left, so pad is presumably sized against those three
// ints to fill one 128-byte line — TODO confirm field order.
206 // Avoid false cache line shares. Big speed difference on a Mac Pro
207 char pad
[128 - sizeof(int)*3];
// kludge: unlocked by the last finishing baton (see pass()); the
// benchmark thread blocks on it in lieu of syncing on complete_q.
210 pthread_mutex_t kludge
;
// Count of batons still in flight; decremented atomically in pass().
211 static int n_baton_kludge
;
213 void pass(dispatch_queue_t
*q
, struct baton
*bat
, const int n_queues
, dispatch_queue_t complete_q
) {
214 //fprintf(stderr, "bat#%d q#%d, passes left: %d\n", bat->baton_number, bat->at_q, bat->baton_number);
215 if (0 == --(bat
->passes_left
)) {
216 dispatch_queue_resume(complete_q
);
218 if (!__sync_sub_and_fetch(&n_baton_kludge
, 1)) {
219 pthread_mutex_unlock(&kludge
);
223 bat
->at_q
= (bat
->at_q
+ 1) % n_queues
;
224 unsigned long rc
= dispatch_async(q
[bat
->at_q
], ^{ pass(q
, bat
, n_queues
, complete_q
); });
// NOTE(review): these lines are the interior of a baton-pass benchmark
// whose opening signature line is missing from the extracted source (its
// report label is "baton pass"); the fragments are left byte-identical.
// One baton per q_div_b queues is started and relayed around the ring by
// pass() until each has made SMALL_LOOP passes.
229 const int n_queues
= 128;
230 const int q_div_b
= 4;
231 const int n_batons
= n_queues
/ q_div_b
;
232 assert(q_div_b
* n_batons
== n_queues
);
233 n_baton_kludge
= n_batons
;
// NOTE(review): stray second ';' after this call in the original.
235 dispatch_queue_t complete_q
= dispatch_queue_create("completion q", NULL
);;
236 char *q_labels
[n_queues
];
// Why a mutex instead of syncing on the suspended completion queue:
240 // creating a queue ("C"), suspending it, blocking in a dispatch_sync, and
241 // having another queue resume C does not appear to ever unblock the
242 // dispatch_sync. XXX: make test case and file radar. (if it still
243 // works that way on recent builds, with dispatch inside libsystem, and
// Lock the kludge mutex so the pthread_mutex_lock below blocks until the
// last baton unlocks it in pass().
247 pthread_mutex_init(&kludge
, NULL
);
248 rc
= pthread_mutex_trylock(&kludge
);
250 q
= alloca(n_queues
* sizeof(dispatch_queue_t
));
// Create the ring of relay queues, each with a unique label.
252 for(i
= 0; i
< n_queues
; i
++) {
253 asprintf(q_labels
+ i
, "relay#%d (%s)", i
, __FUNCTION__
);
255 q
[i
] = dispatch_queue_create(q_labels
[i
], NULL
);
259 uint64_t start_time
= mach_absolute_time();
// Launch one baton on every q_div_b'th queue; complete_q is suspended
// once per baton and resumed by pass() when that baton finishes.
261 for(i
= 0; i
< n_queues
; i
+= q_div_b
) {
262 struct baton
*bat
= valloc(sizeof(struct baton
));
264 bat
->passes_left
= SMALL_LOOP
;
266 bat
->baton_number
= i
/ q_div_b
;
267 dispatch_queue_suspend(complete_q
);
268 rc
= dispatch_async(q
[i
], ^{
269 pass(q
, bat
, n_queues
, complete_q
);
// Blocks until pass() unlocks the kludge mutex for the last baton.
274 // XXX: dispatch_sync(complete_q, ^{});
275 rc
= pthread_mutex_lock(&kludge
);
277 uint64_t end_time
= mach_absolute_time();
278 report(__FUNCTION__
, "baton pass", (end_time
- start_time
), SMALL_LOOP
*n_batons
, "mach");
279 // dispatch_queue_release(q);
// Overload benchmark #2: one baton per queue (q_div_b == 1), all started
// simultaneously by absolute-deadline dispatch timer sources so the whole
// ring fires at once, then measured via pass() as in the baton benchmark.
// NOTE(review): mangled extraction — several original lines (declarations
// of rc/i/end_time, loop/brace closers, teardown) are missing; the
// surviving fragments are left byte-identical. Uses the legacy
// dispatch_source_timer_create()/dispatch_event_t API.
282 void bench_overload2() {
283 const int n_queues
= 128;
284 const int q_div_b
= 1;
285 const int n_batons
= n_queues
/ q_div_b
;
286 n_baton_kludge
= n_batons
;
287 assert(q_div_b
* n_batons
== n_queues
);
288 dispatch_queue_t
*q
= alloca(n_queues
* sizeof(dispatch_queue_t
));
289 dispatch_source_t
*ds
= alloca(n_queues
* sizeof(dispatch_source_t
));
290 dispatch_queue_t complete_q
= dispatch_queue_create("completion q", NULL
);
// First timer block to fire records the shared start timestamp (CAS below).
291 __block
uint64_t start_time
= 0;
292 uint64_t time_to_start
;
294 char *q_labels
[n_queues
];
// Cycle the kludge mutex (left locked by the previous benchmark) so we
// can block on it again below.
298 rc
= pthread_mutex_unlock(&kludge
);
300 rc
= pthread_mutex_trylock(&kludge
);
303 // Start all batons one to two seconds from now.
// Deadline in nanoseconds since the epoch for DISPATCH_TIMER_ABSOLUTE.
304 time_to_start
= (2 + time(NULL
)) * 1000000000;
306 for(i
= 0; i
< n_queues
; i
++) {
307 asprintf(q_labels
+ i
, "queue#%d (%s)", i
, __FUNCTION__
);
309 q
[i
] = dispatch_queue_create(q_labels
[i
], NULL
);
311 struct baton
*bat
= valloc(sizeof(struct baton
));
313 bat
->passes_left
= SMALL_LOOP
;
315 bat
->baton_number
= i
/ q_div_b
;
316 dispatch_queue_suspend(complete_q
);
// One-shot absolute timer on q[i] fires at the common deadline and
// injects this queue's baton into the ring.
317 ds
[i
] = dispatch_source_timer_create(DISPATCH_TIMER_ABSOLUTE
, time_to_start
, 0, NULL
, q
[i
], ^(dispatch_event_t event
){
318 assert(!dispatch_event_get_error(event
, NULL
));
319 // We want to measure the time from the first
320 // baton pass, and NOT include the wait time
321 // for everyone to start to fire
323 uint64_t s
= mach_absolute_time();
// Only the first block to fire wins the CAS and sets start_time.
324 __sync_bool_compare_and_swap(&start_time
, 0, s
);
326 pass(q
, bat
, n_queues
, complete_q
);
// Blocks until the last baton unlocks the kludge mutex in pass().
331 // XXX: dispatch_sync(complete_q, ^{});
332 rc
= pthread_mutex_lock(&kludge
);
335 end_time
= mach_absolute_time();
336 report(__FUNCTION__
, "overload#2", (end_time
- start_time
), SMALL_LOOP
*n_batons
, "mach");
337 // Many releases and free()s
341 void bench_overload1() {
342 const int n_queues
= 128;
343 const int q_div_b
= 1;
344 const int n_batons
= n_queues
/ q_div_b
;
345 n_baton_kludge
= n_batons
;
346 assert(q_div_b
* n_batons
== n_queues
);
347 dispatch_queue_t
*q
= alloca(n_queues
* sizeof(dispatch_queue_t
));
348 dispatch_queue_t complete_q
= dispatch_queue_create("completion q", NULL
);
349 __block
uint64_t start_time
= 0;
350 struct timeval time_to_start
;
352 char *q_labels
[n_queues
];
356 rc
= pthread_mutex_unlock(&kludge
);
358 rc
= pthread_mutex_trylock(&kludge
);
361 // Start all batons one to two seconds from now.
362 gettimeofday(&time_to_start
, NULL
);
363 time_to_start
.tv_sec
+= 2;
365 for(i
= 0; i
< n_queues
; i
++) {
366 asprintf(q_labels
+ i
, "queue#%d (%s)", i
, __FUNCTION__
);
368 q
[i
] = dispatch_queue_create(q_labels
[i
], NULL
);
370 struct baton
*bat
= valloc(sizeof(struct baton
));
372 bat
->passes_left
= SMALL_LOOP
;
374 bat
->baton_number
= i
/ q_div_b
;
375 dispatch_queue_suspend(complete_q
);
376 dispatch_async(q
[i
], ^(void) {
378 gettimeofday(&now
, NULL
);
379 int sec
= time_to_start
.tv_sec
- now
.tv_sec
;
381 int usec
= time_to_start
.tv_usec
+ now
.tv_usec
;
382 if (usec
> 0 || sec
> 0) {
383 usleep(1000000 * sec
+ usec
);
389 // We want to measure the time from the first
390 // baton pass, and NOT include hte wait time
391 // for eveyone to start to fire
393 uint64_t s
= mach_absolute_time();
394 __sync_bool_compare_and_swap(&start_time
, 0, s
);
397 pass(q
, bat
, n_queues
, complete_q
);
401 // XXX: dispatch_sync(complete_q, ^{});
402 rc
= pthread_mutex_lock(&kludge
);
405 end_time
= mach_absolute_time();
406 report(__FUNCTION__
, "overload#1", (end_time
- start_time
), SMALL_LOOP
*n_batons
, "mach");
407 // Many releases and free()s
411 int main(int argc
, char *argv
[]) {
412 // Someday we will be able to take a list of tests to run, or exclude, or something.
414 // There are somewhat different performance characteristics when using the
415 // main queue, so we use a "normal" queue for all our tests.
416 dispatch_queue_t bench_q
= dispatch_queue_create("benhmark Q", NULL
);
418 dispatch_async(bench_q
, ^{
419 // These two aren't as interesting on dual core, they queue all
420 // the calls before making them which isn't really what we
421 // want to test, is it? It also limits the number of loops
422 // we can spin around.
424 bench_message_round_trip();
425 bench_precopy_message_round_trip();
427 bench_message_round_type_syncasync();
428 bench_message_round_trip_f();
429 bench_message_round_type_syncasync_f();
435 // This leaks, so we run it last. Also it gives
436 // wrong results if stdio hasn't been started already,
437 // so we definitely don't want to run it first even if
438 // the leaks are fixed (or ignored)
439 bench_queue_mem_use();