]> git.saurik.com Git - apple/xnu.git/blob - tools/tests/libMicro/libmicro.c
xnu-1504.3.12.tar.gz
[apple/xnu.git] / tools / tests / libMicro / libmicro.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms
5 * of the Common Development and Distribution License
6 * (the "License"). You may not use this file except
7 * in compliance with the License.
8 *
9 * You can obtain a copy of the license at
10 * src/OPENSOLARIS.LICENSE
11 * or http://www.opensolaris.org/os/licensing.
12 * See the License for the specific language governing
13 * permissions and limitations under the License.
14 *
15 * When distributing Covered Code, include this CDDL
16 * HEADER in each file and include the License file at
17 * usr/src/OPENSOLARIS.LICENSE. If applicable,
18 * add the following below this CDDL HEADER, with the
19 * fields enclosed by brackets "[]" replaced with your
20 * own identifying information: Portions Copyright [yyyy]
21 * [name of copyright owner]
22 *
23 * CDDL HEADER END
24 */
25
26 /*
27 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
28 * Use is subject to license terms.
29 */
30
31 /*
32 * benchmarking routines
33 */
34
35 #include <sys/types.h>
36 #include <sys/time.h>
37 #include <sys/ipc.h>
38 #include <sys/sem.h>
39 #include <sys/mman.h>
40 #include <sys/wait.h>
41 #include <ctype.h>
42 #include <string.h>
43 #include <strings.h>
44 #include <signal.h>
45 #include <stdio.h>
46 #include <unistd.h>
47 #include <stdlib.h>
48 #include <poll.h>
49 #include <pthread.h>
50 #include <dlfcn.h>
51 #include <errno.h>
52 #include <sys/resource.h>
53 #include <math.h>
54 #include <limits.h>
55
56 #ifdef __sun
57 #include <sys/elf.h>
58 #endif
59
60 #include "libmicro.h"
61
62
63 #if defined(__APPLE__)
64 #include <mach/mach_time.h>
65
66 long long
67 gethrtime(void)
68 {
69 long long elapsed;
70 static long long start;
71 static mach_timebase_info_data_t sTimebaseInfo = { 0, 0 };
72
73 // If this is the first time we've run, get the timebase.
74 // We can use denom == 0 to indicate that sTimebaseInfo is
75 // uninitialised because it makes no sense to have a zero
76 // denominator in a fraction.
77
78 if ( sTimebaseInfo.denom == 0 ) {
79 (void) mach_timebase_info(&sTimebaseInfo);
80 start = mach_absolute_time();
81 }
82
83 elapsed = mach_absolute_time() - start;
84
85 // Convert to nanoseconds.
86 // return (elapsed * (long long)sTimebaseInfo.numer)/(long long)sTimebaseInfo.denom;
87
88 // Provided the final result is representable in 64 bits the following maneuver will
89 // deliver that result without intermediate overflow.
90 if (sTimebaseInfo.denom == sTimebaseInfo.numer)
91 return elapsed;
92 else if (sTimebaseInfo.denom == 1)
93 return elapsed * (long long)sTimebaseInfo.numer;
94 else {
95 // Decompose elapsed = eta32 * 2^32 + eps32:
96 long long eta32 = elapsed >> 32;
97 long long eps32 = elapsed & 0x00000000ffffffffLL;
98
99 long long numer = sTimebaseInfo.numer, denom = sTimebaseInfo.denom;
100
101 // Form product of elapsed64 (decomposed) and numer:
102 long long mu64 = numer * eta32;
103 long long lambda64 = numer * eps32;
104
105 // Divide the constituents by denom:
106 long long q32 = mu64/denom;
107 long long r32 = mu64 - (q32 * denom); // mu64 % denom
108
109 return (q32 << 32) + ((r32 << 32) + lambda64)/denom;
110 }
111 }
112
113 #endif
114
115 /*
116 * user visible globals
117 */
118
119 int lm_argc = 0;
120 char ** lm_argv = NULL;
121
122 int lm_opt1;
123 int lm_optA;
124 int lm_optB;
125 int lm_optC = 100;
126 int lm_optD;
127 int lm_optE;
128 int lm_optH;
129 int lm_optI;
130 int lm_optL = 0;
131 int lm_optM = 0;
132 char *lm_optN;
133 int lm_optP;
134 int lm_optS;
135 int lm_optT;
136 int lm_optW;
137
138 int lm_def1 = 0;
139 int lm_defB = 0; /* use lm_nsecs_per_op */
140 int lm_defD = 10;
141 int lm_defH = 0;
142 char *lm_defN = NULL;
143 int lm_defP = 1;
144
145 int lm_defS = 0;
146 int lm_defT = 1;
147
148 /*
149 * default on fast platform, should be overridden by individual
150 * benchmarks if significantly wrong in either direction.
151 */
152
153 int lm_nsecs_per_op = 5;
154
155 char *lm_procpath;
156 char lm_procname[STRSIZE];
157 char lm_usage[STRSIZE];
158 char lm_optstr[STRSIZE];
159 char lm_header[STRSIZE];
160 size_t lm_tsdsize = 0;
161
162
163 /*
164 * Globals we do not export to the user
165 */
166
167 static barrier_t *lm_barrier;
168 static pid_t *pids = NULL;
169 static pthread_t *tids = NULL;
170 static int pindex = -1;
171 static void *tsdseg = NULL;
172 static size_t tsdsize = 0;
173
174 #ifdef USE_RDTSC
175 static long long lm_hz = 0;
176 #endif
177
178
179 /*
180 * Forward references
181 */
182
183 static void worker_process();
184 static void usage();
185 static void print_stats(barrier_t *);
186 static void print_histo(barrier_t *);
187 static int remove_outliers(double *, int, stats_t *);
188 static long long nsecs_overhead;
189 static long long nsecs_resolution;
190 static long long get_nsecs_overhead();
191 static int crunch_stats(double *, int, stats_t *);
192 static void compute_stats(barrier_t *);
193 /*
194 * main routine; renamed in this file to allow linking with other
195 * files
196 */
197
198 int
199 actual_main(int argc, char *argv[])
200 {
201 int i;
202 int opt;
203 extern char *optarg;
204 char *tmp;
205 char optstr[256];
206 barrier_t *b;
207 long long startnsecs = getnsecs();
208
209 #ifdef USE_RDTSC
210 if (getenv("LIBMICRO_HZ") == NULL) {
211 (void) printf("LIBMICRO_HZ needed but not set\n");
212 exit(1);
213 }
214 lm_hz = strtoll(getenv("LIBMICRO_HZ"), NULL, 10);
215 #endif
216
217 lm_argc = argc;
218 lm_argv = argv;
219
220 /* before we do anything */
221 (void) benchmark_init();
222
223
224 nsecs_overhead = get_nsecs_overhead();
225 nsecs_resolution = get_nsecs_resolution();
226
227 /*
228 * Set defaults
229 */
230
231 lm_opt1 = lm_def1;
232 lm_optB = lm_defB;
233 lm_optD = lm_defD;
234 lm_optH = lm_defH;
235 lm_optN = lm_defN;
236 lm_optP = lm_defP;
237
238 lm_optS = lm_defS;
239 lm_optT = lm_defT;
240
241 /*
242 * squirrel away the path to the current
243 * binary in a way that works on both
244 * Linux and Solaris
245 */
246
247 if (*argv[0] == '/') {
248 lm_procpath = strdup(argv[0]);
249 *strrchr(lm_procpath, '/') = 0;
250 } else {
251 char path[1024];
252 (void) getcwd(path, 1024);
253 (void) strcat(path, "/");
254 (void) strcat(path, argv[0]);
255 *strrchr(path, '/') = 0;
256 lm_procpath = strdup(path);
257 }
258
259 /*
260 * name of binary
261 */
262
263 if ((tmp = strrchr(argv[0], '/')) == NULL)
264 (void) strcpy(lm_procname, argv[0]);
265 else
266 (void) strcpy(lm_procname, tmp + 1);
267
268 if (lm_optN == NULL) {
269 lm_optN = lm_procname;
270 }
271
272 /*
273 * Parse command line arguments
274 */
275
276 (void) sprintf(optstr, "1AB:C:D:EHI:LMN:P:RST:VW?%s", lm_optstr);
277 while ((opt = getopt(argc, argv, optstr)) != -1) {
278 switch (opt) {
279 case '1':
280 lm_opt1 = 1;
281 break;
282 case 'A':
283 lm_optA = 1;
284 break;
285 case 'B':
286 lm_optB = sizetoint(optarg);
287 break;
288 case 'C':
289 lm_optC = sizetoint(optarg);
290 break;
291 case 'D':
292 lm_optD = sizetoint(optarg);
293 break;
294 case 'E':
295 lm_optE = 1;
296 break;
297 case 'H':
298 lm_optH = 1;
299 break;
300 case 'I':
301 lm_optI = sizetoint(optarg);
302 break;
303 case 'L':
304 lm_optL = 1;
305 break;
306 case 'M':
307 lm_optM = 1;
308 break;
309 case 'N':
310 lm_optN = optarg;
311 break;
312 case 'P':
313 lm_optP = sizetoint(optarg);
314 break;
315 case 'S':
316 lm_optS = 1;
317 break;
318 case 'T':
319 lm_optT = sizetoint(optarg);
320 break;
321 case 'V':
322 (void) printf("%s\n", LIBMICRO_VERSION);
323 exit(0);
324 break;
325 case 'W':
326 lm_optW = 1;
327 lm_optS = 1;
328 break;
329 case '?':
330 usage();
331 exit(0);
332 break;
333 default:
334 if (benchmark_optswitch(opt, optarg) == -1) {
335 usage();
336 exit(0);
337 }
338 }
339 }
340
341 /* deal with implicit and overriding options */
342 if (lm_opt1 && lm_optP > 1) {
343 lm_optP = 1;
344 (void) printf("warning: -1 overrides -P\n");
345 }
346
347 if (lm_optE) {
348 (void) fprintf(stderr, "Running:%20s", lm_optN);
349 (void) fflush(stderr);
350 }
351
352 if (lm_optB == 0) {
353 /*
354 * neither benchmark or user has specified the number
355 * of cnts/sample, so use computed value
356 */
357 if (lm_optI)
358 lm_nsecs_per_op = lm_optI;
359 #define BLOCK_TOCK_DURATION 10000 /* number of raw timer "tocks" ideally comprising a block of work */
360 lm_optB = nsecs_resolution * BLOCK_TOCK_DURATION / lm_nsecs_per_op;
361 if (lm_optB == 0)
362 lm_optB = 1;
363 }
364
365 /*
366 * now that the options are set
367 */
368
369 if (benchmark_initrun() == -1) {
370 exit(1);
371 }
372
373 /* allocate dynamic data */
374 pids = (pid_t *)malloc(lm_optP * sizeof (pid_t));
375 if (pids == NULL) {
376 perror("malloc(pids)");
377 exit(1);
378 }
379 tids = (pthread_t *)malloc(lm_optT * sizeof (pthread_t));
380 if (tids == NULL) {
381 perror("malloc(tids)");
382 exit(1);
383 }
384
385 /* check that the case defines lm_tsdsize before proceeding */
386 if (lm_tsdsize == (size_t)-1) {
387 (void) fprintf(stderr, "error in benchmark_init: "
388 "lm_tsdsize not set\n");
389 exit(1);
390 }
391
392 /* round up tsdsize to nearest 128 to eliminate false sharing */
393 tsdsize = ((lm_tsdsize + 127) / 128) * 128;
394
395 /* allocate sufficient TSD for each thread in each process */
396 tsdseg = (void *)mmap(NULL, lm_optT * lm_optP * tsdsize + 8192,
397 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0L);
398 if (tsdseg == NULL) {
399 perror("mmap(tsd)");
400 exit(1);
401 }
402
403 /* initialise worker synchronisation */
404 b = barrier_create(lm_optT * lm_optP, DATASIZE);
405 if (b == NULL) {
406 perror("barrier_create()");
407 exit(1);
408 }
409 lm_barrier = b;
410 b->ba_flag = 1;
411
412 /* need this here so that parent and children can call exit() */
413 (void) fflush(stdout);
414 (void) fflush(stderr);
415
416 /* when we started and when to stop */
417
418 b->ba_starttime = getnsecs();
419 b->ba_deadline = (long long) (b->ba_starttime + (lm_optD * 1000000LL));
420
421 /* do the work */
422 if (lm_opt1) {
423 /* single process, non-fork mode */
424 pindex = 0;
425 worker_process();
426 } else {
427 /* create worker processes */
428 for (i = 0; i < lm_optP; i++) {
429 pids[i] = fork();
430
431 switch (pids[i]) {
432 case 0:
433 pindex = i;
434 worker_process();
435 exit(0);
436 break;
437 case -1:
438 perror("fork");
439 exit(1);
440 break;
441 default:
442 continue;
443 }
444 }
445
446 /* wait for worker processes */
447 for (i = 0; i < lm_optP; i++) {
448 if (pids[i] > 0) {
449 (void) waitpid(pids[i], NULL, 0);
450 }
451 }
452 }
453
454 b->ba_endtime = getnsecs();
455
456 /* compute results */
457
458 compute_stats(b);
459
460 /* print arguments benchmark was invoked with ? */
461 if (lm_optL) {
462 int l;
463 (void) printf("# %s ", argv[0]);
464 for (l = 1; l < argc; l++) {
465 (void) printf("%s ", argv[l]);
466 }
467 (void) printf("\n");
468 }
469
470 /* print result header (unless suppressed) */
471 if (!lm_optH) {
472 (void) printf("%12s %3s %3s %12s %12s %8s %8s %s\n",
473 "", "prc", "thr",
474 "usecs/call",
475 "samples", "errors", "cnt/samp", lm_header);
476 }
477
478 /* print result */
479
480 (void) printf("%-12s %3d %3d %12.5f %12d %8lld %8d %s\n",
481 lm_optN, lm_optP, lm_optT,
482 (lm_optM?b->ba_corrected.st_mean:b->ba_corrected.st_median),
483 b->ba_batches, b->ba_errors, lm_optB,
484 benchmark_result());
485
486 if (lm_optS) {
487 print_stats(b);
488 }
489
490 /* just incase something goes awry */
491 (void) fflush(stdout);
492 (void) fflush(stderr);
493
494 /* cleanup by stages */
495 (void) benchmark_finirun();
496 (void) barrier_destroy(b);
497 (void) benchmark_fini();
498
499 if (lm_optE) {
500 (void) fprintf(stderr, " for %12.5f seconds\n",
501 (double)(getnsecs() - startnsecs) /
502 1.e9);
503 (void) fflush(stderr);
504 }
505 return (0);
506 }
507
508 void *
509 worker_thread(void *arg)
510 {
511 result_t r;
512 long long last_sleep = 0;
513 long long t;
514
515 r.re_errors = benchmark_initworker(arg);
516
517 while (lm_barrier->ba_flag) {
518 r.re_count = 0;
519 r.re_errors += benchmark_initbatch(arg);
520
521 /* sync to clock */
522
523 if (lm_optA && ((t = getnsecs()) - last_sleep) > 75000000LL) {
524 (void) poll(0, 0, 10);
525 last_sleep = t;
526 }
527 /* wait for it ... */
528 (void) barrier_queue(lm_barrier, NULL);
529
530 /* time the test */
531 r.re_t0 = getnsecs();
532 (void) benchmark(arg, &r);
533 r.re_t1 = getnsecs();
534
535 /* time to stop? */
536 if (r.re_t1 > lm_barrier->ba_deadline &&
537 (!lm_optC || lm_optC < lm_barrier->ba_batches)) {
538 lm_barrier->ba_flag = 0;
539 }
540
541 /* record results and sync */
542 (void) barrier_queue(lm_barrier, &r);
543
544 (void) benchmark_finibatch(arg);
545
546 r.re_errors = 0;
547 }
548
549 (void) benchmark_finiworker(arg);
550
551 return (0);
552 }
553
554 void
555 worker_process()
556 {
557 int i;
558 void *tsd;
559
560 for (i = 1; i < lm_optT; i++) {
561 tsd = gettsd(pindex, i);
562 if (pthread_create(&tids[i], NULL, worker_thread, tsd) != 0) {
563 perror("pthread_create");
564 exit(1);
565 }
566 }
567
568 tsd = gettsd(pindex, 0);
569 (void) worker_thread(tsd);
570
571 for (i = 1; i < lm_optT; i++) {
572 (void) pthread_join(tids[i], NULL);
573 }
574 }
575
576 void
577 usage()
578 {
579 (void) printf(
580 "usage: %s\n"
581 " [-1] (single process; overrides -P > 1)\n"
582 " [-A] (align with clock)\n"
583 " [-B batch-size (default %d)]\n"
584 " [-C minimum number of samples (default 0)]\n"
585 " [-D duration in msecs (default %ds)]\n"
586 " [-E (echo name to stderr)]\n"
587 " [-H] (suppress headers)\n"
588 " [-I] nsecs per op (used to compute batch size)"
589 " [-L] (print argument line)\n"
590 " [-M] (reports mean rather than median)\n"
591 " [-N test-name (default '%s')]\n"
592 " [-P processes (default %d)]\n"
593 " [-S] (print detailed stats)\n"
594 " [-T threads (default %d)]\n"
595 " [-V] (print the libMicro version and exit)\n"
596 " [-W] (flag possible benchmark problems)\n"
597 "%s\n",
598 lm_procname,
599 lm_defB, lm_defD, lm_procname, lm_defP, lm_defT,
600 lm_usage);
601 }
602
603 void
604 print_warnings(barrier_t *b)
605 {
606 int head = 0;
607 int increase;
608
609 if (b->ba_quant) {
610 if (!head++) {
611 (void) printf("#\n# WARNINGS\n");
612 }
613 increase = (int)(floor((nsecs_resolution * 100.0) /
614 ((double)lm_optB * b->ba_corrected.st_median * 1000.0)) +
615 1.0);
616 (void) printf("# Quantization error likely;"
617 "increase batch size (-B option) %dX to avoid.\n",
618 increase);
619 }
620
621 /*
622 * XXX should warn on median != mean by a lot
623 */
624
625 if (b->ba_errors) {
626 if (!head++) {
627 (void) printf("#\n# WARNINGS\n");
628 }
629 (void) printf("# Errors occured during benchmark.\n");
630 }
631 }
632
633 void
634 print_stats(barrier_t *b)
635 {
636 (void) printf("#\n");
637 (void) printf("# STATISTICS %12s %12s\n",
638 "usecs/call (raw)",
639 "usecs/call (outliers removed)");
640
641 if (b->ba_count == 0) {
642 (void) printf("zero samples\n");
643 return;
644 }
645
646 (void) printf("# min %12.5f %12.5f\n",
647 b->ba_raw.st_min,
648 b->ba_corrected.st_min);
649
650 (void) printf("# max %12.5f %12.5f\n",
651 b->ba_raw.st_max,
652 b->ba_corrected.st_max);
653 (void) printf("# mean %12.5f %12.5f\n",
654 b->ba_raw.st_mean,
655 b->ba_corrected.st_mean);
656 (void) printf("# median %12.5f %12.5f\n",
657 b->ba_raw.st_median,
658 b->ba_corrected.st_median);
659 (void) printf("# stddev %12.5f %12.5f\n",
660 b->ba_raw.st_stddev,
661 b->ba_corrected.st_stddev);
662 (void) printf("# standard error %12.5f %12.5f\n",
663 b->ba_raw.st_stderr,
664 b->ba_corrected.st_stderr);
665 (void) printf("# 99%% confidence level %12.5f %12.5f\n",
666 b->ba_raw.st_99confidence,
667 b->ba_corrected.st_99confidence);
668 (void) printf("# skew %12.5f %12.5f\n",
669 b->ba_raw.st_skew,
670 b->ba_corrected.st_skew);
671 (void) printf("# kurtosis %12.5f %12.5f\n",
672 b->ba_raw.st_kurtosis,
673 b->ba_corrected.st_kurtosis);
674
675 (void) printf("# time correlation %12.5f %12.5f\n",
676 b->ba_raw.st_timecorr,
677 b->ba_corrected.st_timecorr);
678 (void) printf("#\n");
679
680 (void) printf("# elasped time %12.5f\n", (b->ba_endtime -
681 b->ba_starttime) / 1.0e9);
682 (void) printf("# number of samples %12d\n", b->ba_batches);
683 (void) printf("# number of outliers %12d\n", b->ba_outliers);
684 (void) printf("# getnsecs overhead %12d\n", (int)nsecs_overhead);
685
686 (void) printf("#\n");
687 (void) printf("# DISTRIBUTION\n");
688
689 print_histo(b);
690
691 if (lm_optW) {
692 print_warnings(b);
693 }
694 }
695
696 void
697 update_stats(barrier_t *b, result_t *r)
698 {
699 double time;
700 double nsecs_per_call;
701
702 if (b->ba_waiters == 0) {
703 /* first thread only */
704 b->ba_t0 = r->re_t0;
705 b->ba_t1 = r->re_t1;
706 b->ba_count0 = 0;
707 b->ba_errors0 = 0;
708 } else {
709 /* all but first thread */
710 if (r->re_t0 < b->ba_t0) {
711 b->ba_t0 = r->re_t0;
712 }
713 if (r->re_t1 > b->ba_t1) {
714 b->ba_t1 = r->re_t1;
715 }
716 }
717
718 b->ba_count0 += r->re_count;
719 b->ba_errors0 += r->re_errors;
720
721 if (b->ba_waiters == b->ba_hwm - 1) {
722 /* last thread only */
723
724
725 time = (double)b->ba_t1 - (double)b->ba_t0 -
726 (double)nsecs_overhead;
727
728 if (time < 100 * nsecs_resolution)
729 b->ba_quant++;
730
731 /*
732 * normalize by procs * threads if not -U
733 */
734
735 nsecs_per_call = time / (double)b->ba_count0 *
736 (double)(lm_optT * lm_optP);
737
738 b->ba_count += b->ba_count0;
739 b->ba_errors += b->ba_errors0;
740
741 b->ba_data[b->ba_batches % b->ba_datasize] =
742 nsecs_per_call;
743
744 b->ba_batches++;
745 }
746 }
747
748 #ifdef USE_SEMOP
749 barrier_t *
750 barrier_create(int hwm, int datasize)
751 {
752 struct sembuf s[1];
753 barrier_t *b;
754
755 /*LINTED*/
756 b = (barrier_t *)mmap(NULL,
757 sizeof (barrier_t) + (datasize - 1) * sizeof (double),
758 PROT_READ | PROT_WRITE,
759 MAP_SHARED | MAP_ANON, -1, 0L);
760 if (b == (barrier_t *)MAP_FAILED) {
761 return (NULL);
762 }
763 b->ba_datasize = datasize;
764
765 b->ba_flag = 0;
766 b->ba_hwm = hwm;
767 b->ba_semid = semget(IPC_PRIVATE, 3, 0600);
768 if (b->ba_semid == -1) {
769 (void) munmap((void *)b, sizeof (barrier_t));
770 return (NULL);
771 }
772
773 /* [hwm - 1, 0, 0] */
774 s[0].sem_num = 0;
775 s[0].sem_op = hwm - 1;
776 s[0].sem_flg = 0;
777 if (semop(b->ba_semid, s, 1) == -1) {
778 perror("semop(1)");
779 (void) semctl(b->ba_semid, 0, IPC_RMID);
780 (void) munmap((void *)b, sizeof (barrier_t));
781 return (NULL);
782 }
783
784 b->ba_waiters = 0;
785 b->ba_phase = 0;
786
787 b->ba_count = 0;
788 b->ba_errors = 0;
789
790 return (b);
791 }
792
793 int
794 barrier_destroy(barrier_t *b)
795 {
796 (void) semctl(b->ba_semid, 0, IPC_RMID);
797 (void) munmap((void *)b, sizeof (barrier_t));
798
799 return (0);
800 }
801
802 int
803 barrier_queue(barrier_t *b, result_t *r)
804 {
805 struct sembuf s[2];
806
807 /*
808 * {s0(-(hwm-1))}
809 * if ! nowait {s1(-(hwm-1))}
810 * (all other threads)
811 * update shared stats
812 * {s0(hwm-1), s1(1)}
813 * {s0(1), s2(-1)}
814 * else
815 * (last thread)
816 * update shared stats
817 * {s2(hwm-1)}
818 */
819
820 s[0].sem_num = 0;
821 s[0].sem_op = -(b->ba_hwm - 1);
822 s[0].sem_flg = 0;
823 if (semop(b->ba_semid, s, 1) == -1) {
824 perror("semop(2)");
825 return (-1);
826 }
827
828 s[0].sem_num = 1;
829 s[0].sem_op = -(b->ba_hwm - 1);
830 s[0].sem_flg = IPC_NOWAIT;
831 if (semop(b->ba_semid, s, 1) == -1) {
832 if (errno != EAGAIN) {
833 perror("semop(3)");
834 return (-1);
835 }
836
837 /* all but the last thread */
838
839 if (r != NULL) {
840 update_stats(b, r);
841 }
842
843 b->ba_waiters++;
844
845 s[0].sem_num = 0;
846 s[0].sem_op = b->ba_hwm - 1;
847 s[0].sem_flg = 0;
848 s[1].sem_num = 1;
849 s[1].sem_op = 1;
850 s[1].sem_flg = 0;
851 if (semop(b->ba_semid, s, 2) == -1) {
852 perror("semop(4)");
853 return (-1);
854 }
855
856 s[0].sem_num = 0;
857 s[0].sem_op = 1;
858 s[0].sem_flg = 0;
859 s[1].sem_num = 2;
860 s[1].sem_op = -1;
861 s[1].sem_flg = 0;
862 if (semop(b->ba_semid, s, 2) == -1) {
863 perror("semop(5)");
864 return (-1);
865 }
866
867 } else {
868 /* the last thread */
869
870 if (r != NULL) {
871 update_stats(b, r);
872 }
873
874 b->ba_waiters = 0;
875 b->ba_phase++;
876
877 s[0].sem_num = 2;
878 s[0].sem_op = b->ba_hwm - 1;
879 s[0].sem_flg = 0;
880 if (semop(b->ba_semid, s, 1) == -1) {
881 perror("semop(6)");
882 return (-1);
883 }
884 }
885
886 return (0);
887 }
888
889 #else /* USE_SEMOP */
890
891 barrier_t *
892 barrier_create(int hwm, int datasize)
893 {
894 pthread_mutexattr_t attr;
895 pthread_condattr_t cattr;
896 barrier_t *b;
897
898 /*LINTED*/
899 b = (barrier_t *)mmap(NULL,
900 sizeof (barrier_t) + (datasize - 1) * sizeof (double),
901 PROT_READ | PROT_WRITE,
902 MAP_SHARED | MAP_ANON, -1, 0L);
903 if (b == (barrier_t *)MAP_FAILED) {
904 return (NULL);
905 }
906 b->ba_datasize = datasize;
907
908 b->ba_hwm = hwm;
909 b->ba_flag = 0;
910
911 (void) pthread_mutexattr_init(&attr);
912 (void) pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
913
914 (void) pthread_condattr_init(&cattr);
915 (void) pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
916
917 (void) pthread_mutex_init(&b->ba_lock, &attr);
918 (void) pthread_cond_init(&b->ba_cv, &cattr);
919
920 b->ba_waiters = 0;
921 b->ba_phase = 0;
922
923 b->ba_count = 0;
924 b->ba_errors = 0;
925
926 return (b);
927 }
928
929 int
930 barrier_destroy(barrier_t *b)
931 {
932 (void) munmap((void *)b, sizeof (barrier_t));
933
934 return (0);
935 }
936
937 int
938 barrier_queue(barrier_t *b, result_t *r)
939 {
940 int phase;
941
942 (void) pthread_mutex_lock(&b->ba_lock);
943
944 if (r != NULL) {
945 update_stats(b, r);
946 }
947
948 phase = b->ba_phase;
949
950 b->ba_waiters++;
951 if (b->ba_hwm == b->ba_waiters) {
952 b->ba_waiters = 0;
953 b->ba_phase++;
954 (void) pthread_cond_broadcast(&b->ba_cv);
955 }
956
957 while (b->ba_phase == phase) {
958 (void) pthread_cond_wait(&b->ba_cv, &b->ba_lock);
959 }
960
961 (void) pthread_mutex_unlock(&b->ba_lock);
962 return (0);
963 }
964 #endif /* USE_SEMOP */
965
966 int
967 gettindex()
968 {
969 int i;
970
971 if (tids == NULL) {
972 return (-1);
973 }
974
975 for (i = 1; i < lm_optT; i++) {
976 if (pthread_self() == tids[i]) {
977 return (i);
978 }
979 }
980
981 return (0);
982 }
983
984 int
985 getpindex()
986 {
987 return (pindex);
988 }
989
990 void *
991 gettsd(int p, int t)
992 {
993 if ((p < 0) || (p >= lm_optP) || (t < 0) || (t >= lm_optT))
994 return (NULL);
995
996 return ((void *)((unsigned long)tsdseg +
997 (((p * lm_optT) + t) * tsdsize)));
998 }
999
1000 #if defined(__APPLE__)
1001 int
1002 gettsdindex(void *arg){
1003 /*
1004 * gettindex() can race with pthread_create() filling in tids[].
1005 * This is an alternative approach to finding the calling thread's tsd in t
1006 sdseg
1007 */
1008 return tsdsize ? ((unsigned long)arg - (unsigned long)tsdseg)/tsdsize : 0;
1009 }
1010 #endif /* __APPLE__ */
1011
1012 #ifdef USE_GETHRTIME
/* Current time in nanoseconds, straight from gethrtime(). */
long long
getnsecs()
{
	long long	now = gethrtime();

	return (now);
}
1018
/* Current time in microseconds: gethrtime() nanoseconds divided by 1000. */
long long
getusecs()
{
	long long	nsecs = gethrtime();

	return (nsecs / 1000);
}
1024
1025 #elif USE_RDTSC /* USE_GETHRTIME */
1026
/*
 * Reads the x86 timestamp counter (opcode 0f 31, RDTSC).
 *
 * The two 32-bit halves are taken from eax and edx explicitly.  The "A"
 * constraint used before names the edx:eax pair only on 32-bit x86; on
 * x86-64 it selects a single register, so half of the counter was lost.
 *
 * Defined as an ordinary external function: under C99 inline rules a
 * plain "inline" definition provides no external definition to link
 * against.
 */
long long
rdtsc(void)
{
	unsigned int	lo;
	unsigned int	hi;

	__asm__ volatile(".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi));

	return ((long long)(((unsigned long long)hi << 32) | lo));
}
1034
1035 long long
1036 getusecs()
1037 {
1038 return (rdtsc() * 1000000 / lm_hz);
1039 }
1040
1041 long long
1042 getnsecs()
1043 {
1044 return (rdtsc() * 1000000000 / lm_hz);
1045 }
1046
1047 #else /* USE_GETHRTIME */
1048
/* Current gettimeofday() time expressed in microseconds. */
long long
getusecs()
{
	struct timeval	now;
	long long	secs;
	long long	usecs;

	(void) gettimeofday(&now, NULL);

	secs = (long long)now.tv_sec;
	usecs = (long long)now.tv_usec;

	return (secs * 1000000LL + usecs);
}
1058
/*
 * Current gettimeofday() time expressed in nanoseconds.  The value is
 * always a multiple of 1000 because the source has microsecond units.
 */
long long
getnsecs()
{
	struct timeval	now;
	long long	secs;
	long long	usecs;

	(void) gettimeofday(&now, NULL);

	secs = (long long)now.tv_sec;
	usecs = (long long)now.tv_usec;

	return (secs * 1000000000LL + usecs * 1000LL);
}
1069
1070 #endif /* USE_GETHRTIME */
1071
/*
 * Makes sure the soft limit on open files is at least limit, raising
 * the hard limit too when it is lower.  Nothing is changed when the
 * soft limit already exceeds limit.  Returns 0; exits the process if
 * the limits cannot be read (status 1) or set (status 3).
 */
int
setfdlimit(int limit)
{
	struct rlimit	rl;

	if (getrlimit(RLIMIT_NOFILE, &rl) < 0) {
		perror("getrlimit");
		exit(1);
	}

	if (!(rl.rlim_cur > limit)) {
		rl.rlim_cur = limit;

		if (rl.rlim_max < limit)
			rl.rlim_max = limit;

		if (setrlimit(RLIMIT_NOFILE, &rl) < 0) {
			perror("setrlimit");
			exit(3);
		}
	}

	return (0);
}
1097
1098
#define	KILOBYTE	1024
#define	MEGABYTE	(KILOBYTE * KILOBYTE)
#define	GIGABYTE	(KILOBYTE * MEGABYTE)

/*
 * Works out the multiplier implied by an optional k/K, m/M or g/G
 * suffix on arg and stores it in *mult (1 when arg does not end in a
 * letter).  Returns 0 on success, or -1 if the trailing letter is not
 * a known suffix or a suffixed argument has a non-digit before it.
 */
static int
size_multiplier(const char *arg, long long *mult)
{
	size_t	len = strlen(arg);
	size_t	i;

	*mult = 1;

	/* <ctype.h> functions need an unsigned char value */
	if (len == 0 || !isalpha((unsigned char)arg[len - 1]))
		return (0);

	switch (arg[len - 1]) {
	case 'k':
	case 'K':
		*mult = KILOBYTE;
		break;
	case 'm':
	case 'M':
		*mult = MEGABYTE;
		break;
	case 'g':
	case 'G':
		*mult = GIGABYTE;
		break;
	default:
		return (-1);
	}

	for (i = 0; i + 1 < len; i++)
		if (!isdigit((unsigned char)arg[i]))
			return (-1);

	return (0);
}

/*
 * Converts a decimal size with an optional k/m/g suffix (powers of
 * 1024) to a long long.  Returns -1 for a malformed suffixed argument
 * or when the scaled value does not fit in a long long.
 */
long long
sizetoll(const char *arg)
{
	long long	mult;
	long long	value;

	if (size_multiplier(arg, &mult) == -1)
		return (-1);

	value = strtoll(arg, NULL, 10);

	/* only suffixed (hence non-negative) values are scaled */
	if (mult > 1 && value > LLONG_MAX / mult)
		return (-1);

	return (mult * value);
}

/*
 * Same as sizetoll() but for int results.  Returns -1 when the value
 * does not fit in an int instead of silently truncating it.
 */
int
sizetoint(const char *arg)
{
	long long	value = sizetoll(arg);

	if (value > INT_MAX || value < INT_MIN)
		return (-1);

	return ((int)value);
}
1170
/*
 * Prints a 32 column bar for count relative to total: a '*' in the
 * first column when count is non-zero, further '*' up to
 * 32 * count / total columns, then blanks.
 *
 * A total of zero or less gives an empty scale instead of dividing by
 * zero, and the bar is capped at 32 columns so that a count larger than
 * total cannot push the following column out of line.
 */
static void
print_bar(long count, long total)
{
	long	stars = 0;
	int	i;

	if (total > 0)
		stars = (32 * count) / total;
	if (stars > 32)
		stars = 32;

	(void) putchar_unlocked(count ? '*' : ' ');
	for (i = 1; i < stars; i++)
		(void) putchar_unlocked('*');
	for (; i < 32; i++)
		(void) putchar_unlocked(' ');
}
1182
/*
 * qsort() comparison callback for doubles in ascending order: returns
 * 1, -1 or 0 as the first value is greater than, less than, or neither
 * greater nor less than the second.
 */
static int
doublecmp(const void *p1, const void *p2)
{
	const double	lhs = *(double *)p1;
	const double	rhs = *(double *)p2;

	return ((lhs > rhs) - (lhs < rhs));
}
1195
1196 static void
1197 print_histo(barrier_t *b)
1198 {
1199 int n;
1200 int i;
1201 int j;
1202 int last;
1203 long long maxcount;
1204 double sum;
1205 long long min;
1206 long long scale;
1207 double x;
1208 long long y;
1209 long long count;
1210 int i95;
1211 double p95;
1212 double r95;
1213 double m95;
1214 histo_t *histo;
1215
1216 (void) printf("# %12s %12s %32s %12s\n", "counts", "usecs/call",
1217 "", "means");
1218
1219 /* calculate how much data we've captured */
1220 n = b->ba_batches > b->ba_datasize ? b->ba_datasize : b->ba_batches;
1221
1222 /* find the 95th percentile - index, value and range */
1223 qsort((void *)b->ba_data, n, sizeof (double), doublecmp);
1224 min = b->ba_data[0] + 0.000001;
1225 i95 = n * 95 / 100;
1226 p95 = b->ba_data[i95];
1227 r95 = p95 - min + 1;
1228
1229 /* find a suitable min and scale */
1230 i = 0;
1231 x = r95 / (HISTOSIZE - 1);
1232 while (x >= 10.0) {
1233 x /= 10.0;
1234 i++;
1235 }
1236 y = x + 0.9999999999;
1237 while (i > 0) {
1238 y *= 10;
1239 i--;
1240 }
1241 min /= y;
1242 min *= y;
1243 scale = y * (HISTOSIZE - 1);
1244 if (scale < (HISTOSIZE - 1)) {
1245 scale = (HISTOSIZE - 1);
1246 }
1247
1248 /* create and initialise the histogram */
1249 histo = malloc(HISTOSIZE * sizeof (histo_t));
1250 for (i = 0; i < HISTOSIZE; i++) {
1251 histo[i].sum = 0.0;
1252 histo[i].count = 0;
1253 }
1254
1255 /* populate the histogram */
1256 last = 0;
1257 sum = 0.0;
1258 count = 0;
1259 for (i = 0; i < i95; i++) {
1260 j = (HISTOSIZE - 1) * (b->ba_data[i] - min) / scale;
1261
1262 if (j >= HISTOSIZE) {
1263 (void) printf("panic!\n");
1264 j = HISTOSIZE - 1;
1265 }
1266
1267 histo[j].sum += b->ba_data[i];
1268 histo[j].count++;
1269
1270 sum += b->ba_data[i];
1271 count++;
1272 }
1273 m95 = sum / count;
1274
1275 /* find the larges bucket */
1276 maxcount = 0;
1277 for (i = 0; i < HISTOSIZE; i++)
1278 if (histo[i].count > 0) {
1279 last = i;
1280 if (histo[i].count > maxcount)
1281 maxcount = histo[i].count;
1282 }
1283
1284 /* print the buckets */
1285 for (i = 0; i <= last; i++) {
1286 (void) printf("# %12lld %12.5f |", histo[i].count,
1287 (min + scale * (double)i / (HISTOSIZE - 1)));
1288
1289 print_bar(histo[i].count, maxcount);
1290
1291 if (histo[i].count > 0)
1292 (void) printf("%12.5f\n",
1293 histo[i].sum / histo[i].count);
1294 else
1295 (void) printf("%12s\n", "-");
1296 }
1297
1298 /* find the mean of values beyond the 95th percentile */
1299 sum = 0.0;
1300 count = 0;
1301 for (i = i95; i < n; i++) {
1302 sum += b->ba_data[i];
1303 count++;
1304 }
1305
1306 /* print the >95% bucket summary */
1307 (void) printf("#\n");
1308 (void) printf("# %12lld %12s |", count, "> 95%");
1309 print_bar(count, maxcount);
1310 if (count > 0)
1311 (void) printf("%12.5f\n", sum / count);
1312 else
1313 (void) printf("%12s\n", "-");
1314 (void) printf("#\n");
1315 (void) printf("# %12s %12.5f\n", "mean of 95%", m95);
1316 (void) printf("# %12s %12.5f\n", "95th %ile", p95);
1317
1318 /* quantify any buffer overflow */
1319 if (b->ba_batches > b->ba_datasize)
1320 (void) printf("# %12s %12d\n", "data dropped",
1321 b->ba_batches - b->ba_datasize);
1322 }
1323
1324 static void
1325 compute_stats(barrier_t *b)
1326 {
1327 int i;
1328
1329 if (b->ba_batches > b->ba_datasize)
1330 b->ba_batches = b->ba_datasize;
1331
1332 /*
1333 * convert to usecs/call
1334 */
1335
1336 for (i = 0; i < b->ba_batches; i++)
1337 b->ba_data[i] /= 1000.0;
1338
1339 /*
1340 * do raw stats
1341 */
1342
1343 (void) crunch_stats(b->ba_data, b->ba_batches, &b->ba_raw);
1344
1345 /*
1346 * recursively apply 3 sigma rule to remove outliers
1347 */
1348
1349 b->ba_corrected = b->ba_raw;
1350 b->ba_outliers = 0;
1351
1352 if (b->ba_batches > 40) { /* remove outliers */
1353 int removed;
1354
1355 do {
1356 removed = remove_outliers(b->ba_data, b->ba_batches,
1357 &b->ba_corrected);
1358 b->ba_outliers += removed;
1359 b->ba_batches -= removed;
1360 (void) crunch_stats(b->ba_data, b->ba_batches,
1361 &b->ba_corrected);
1362 } while (removed != 0 && b->ba_batches > 40);
1363 }
1364
1365 }
1366
1367 /*
1368 * routine to compute various statistics on array of doubles.
1369 */
1370
1371 static int
1372 crunch_stats(double *data, int count, stats_t *stats)
1373 {
1374 double a;
1375 double std;
1376 double diff;
1377 double sk;
1378 double ku;
1379 double mean;
1380 int i;
1381 int bytes;
1382 double *dupdata;
1383
1384 /*
1385 * first we need the mean
1386 */
1387
1388 mean = 0.0;
1389
1390 for (i = 0; i < count; i++) {
1391 mean += data[i];
1392 }
1393
1394 mean /= count;
1395
1396 stats->st_mean = mean;
1397
1398 /*
1399 * malloc and sort so we can do median
1400 */
1401
1402 dupdata = malloc(bytes = sizeof (double) * count);
1403 (void) memcpy(dupdata, data, bytes);
1404 qsort((void *)dupdata, count, sizeof (double), doublecmp);
1405 stats->st_median = dupdata[count/2];
1406
1407 /*
1408 * reuse dupdata to compute time correlation of data to
1409 * detect interesting time-based trends
1410 */
1411
1412 for (i = 0; i < count; i++)
1413 dupdata[i] = (double)i;
1414
1415 (void) fit_line(dupdata, data, count, &a, &stats->st_timecorr);
1416 free(dupdata);
1417
1418 std = 0.0;
1419 sk = 0.0;
1420 ku = 0.0;
1421
1422 stats->st_max = -1;
1423 stats->st_min = 1.0e99; /* hard to find portable values */
1424
1425 for (i = 0; i < count; i++) {
1426 if (data[i] > stats->st_max)
1427 stats->st_max = data[i];
1428 if (data[i] < stats->st_min)
1429 stats->st_min = data[i];
1430
1431 diff = data[i] - mean;
1432 std += diff * diff;
1433 sk += diff * diff * diff;
1434 ku += diff * diff * diff * diff;
1435 }
1436
1437 stats->st_stddev = std = sqrt(std/(double)(count - 1));
1438 stats->st_stderr = std / sqrt(count);
1439 stats->st_99confidence = stats->st_stderr * 2.326;
1440 stats->st_skew = sk / (std * std * std) / (double)(count);
1441 stats->st_kurtosis = ku / (std * std * std * std) /
1442 (double)(count) - 3;
1443
1444 return (0);
1445 }
1446
1447 /*
1448 * does a least squares fit to the set of points x, y and
1449 * fits a line y = a + bx. Returns a, b
1450 */
1451
/*
 * Least squares fit of the line y = a + b*x through count points.
 * On success stores the intercept in *a and the slope in *b and returns
 * 0.  Returns -1, leaving *a and *b untouched, when the points do not
 * determine a line (count * sum(x^2) equals sum(x)^2).
 */
int
fit_line(double *x, double *y, int count, double *a, double *b)
{
	double	sx = 0.0;
	double	sy = 0.0;
	double	sxy = 0.0;
	double	sxx = 0.0;
	double	det;
	int	k;

	for (k = 0; k < count; k++) {
		const double	xk = x[k];
		const double	yk = y[k];

		sx += xk;
		sxx += xk * xk;
		sy += yk;
		sxy += xk * yk;
	}

	det = count * sxx - sx * sx;
	if (det == 0.0)
		return (-1);

	*a = (sy * sxx - sx * sxy) / det;
	*b = (count * sxy - sx * sy) / det;

	return (0);
}
1479
1480 /*
1481 * empty function for measurement purposes
1482 */
1483
/* Does nothing and returns 1; exists only so that a call can be timed. */
int
nop()
{
	enum { NOP_RESULT = 1 };

	return (NOP_RESULT);
}
1489
1490 #define NSECITER 1000
1491
1492 static long long
1493 get_nsecs_overhead()
1494 {
1495 long long s;
1496
1497 double data[NSECITER];
1498 stats_t stats;
1499
1500 int i;
1501 int count;
1502 int outliers;
1503
1504 (void) getnsecs(); /* warmup */
1505 (void) getnsecs(); /* warmup */
1506 (void) getnsecs(); /* warmup */
1507
1508 i = 0;
1509
1510 count = NSECITER;
1511
1512 for (i = 0; i < count; i++) {
1513 s = getnsecs();
1514 data[i] = getnsecs() - s;
1515 }
1516
1517 (void) crunch_stats(data, count, &stats);
1518
1519 while ((outliers = remove_outliers(data, count, &stats)) != 0) {
1520 count -= outliers;
1521 (void) crunch_stats(data, count, &stats);
1522 }
1523
1524 return ((long long)stats.st_mean);
1525
1526 }
1527
/*
 * Estimates the resolution of getnsecs() in nanoseconds.
 *
 * First finds how many iterations of an empty loop are needed before
 * two clock readings differ, then times 1000 delays of linearly growing
 * length and returns the smallest positive difference between
 * successive timings.  Returns 1 << 30 if no positive difference is
 * seen.
 *
 * Delay counters and time differences are kept in long long: the delay
 * length nops * i can exceed INT_MAX, and a difference of two 64-bit
 * times need not fit in an int.
 *
 * NOTE(review): the delay loops are empty, so an optimising compiler
 * may remove them - confirm how this file is built.
 */
long long
get_nsecs_resolution()
{
	long long	y[1000];

	int		i;
	long long	j;
	long long	nops;
	long long	res;
	long long	start, stop;

	/*
	 * first, figure out how many nops to use
	 * to get any delta between time measurements.
	 * use a minimum of one.
	 */

	/*
	 * warm cache
	 */

	stop = start = getnsecs();

	for (i = 1; i < 10000000; i++) {
		start = getnsecs();
		for (j = i; j; j--)
			;
		stop = getnsecs();
		if (stop > start)
			break;
	}

	nops = i;

	/*
	 * now collect data at linearly varying intervals
	 */

	for (i = 0; i < 1000; i++) {
		start = getnsecs();
		for (j = nops * i; j; j--)
			;
		stop = getnsecs();
		y[i] = stop - start;
	}

	/*
	 * find smallest positive difference between samples;
	 * this is the timer resolution
	 */

	res = 1<<30;

	for (i = 1; i < 1000; i++) {
		long long diff = y[i] - y[i-1];

		if (diff > 0 && res > diff)
			res = diff;

	}

	return (res);
}
1588
1589 /*
1590 * remove any data points from the array more than 3 sigma out
1591 */
1592
1593 static int
1594 remove_outliers(double *data, int count, stats_t *stats)
1595 {
1596 double outmin = stats->st_mean - 3 * stats->st_stddev;
1597 double outmax = stats->st_mean + 3 * stats->st_stddev;
1598
1599 int i, j, outliers;
1600
1601 for (outliers = i = j = 0; i < count; i++)
1602 if (data[i] > outmax || data[i] < outmin)
1603 outliers++;
1604 else
1605 data[j++] = data[i];
1606
1607 return (outliers);
1608 }