]> git.saurik.com Git - apple/xnu.git/blobdiff - tools/tests/libMicro/libmicro.c
xnu-1456.1.26.tar.gz
[apple/xnu.git] / tools / tests / libMicro / libmicro.c
diff --git a/tools/tests/libMicro/libmicro.c b/tools/tests/libMicro/libmicro.c
new file mode 100644 (file)
index 0000000..a3239c6
--- /dev/null
@@ -0,0 +1,1608 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms
+ * of the Common Development and Distribution License
+ * (the "License").  You may not use this file except
+ * in compliance with the License.
+ *
+ * You can obtain a copy of the license at
+ * src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL
+ * HEADER in each file and include the License file at
+ * usr/src/OPENSOLARIS.LICENSE.  If applicable,
+ * add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your
+ * own identifying information: Portions Copyright [yyyy]
+ * [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * benchmarking routines
+ */
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/ipc.h>
+#include <sys/sem.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <ctype.h>
+#include <string.h>
+#include <strings.h>
+#include <signal.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <poll.h>
+#include <pthread.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <sys/resource.h>
+#include <math.h>
+#include <limits.h>
+
+#ifdef __sun
+#include <sys/elf.h>
+#endif
+
+#include "libmicro.h"
+
+
+#if defined(__APPLE__)
+#include <mach/mach_time.h>
+
+long long
+gethrtime(void)
+{
+   long long        elapsed;
+   static long long        start;
+   static mach_timebase_info_data_t    sTimebaseInfo = { 0, 0 };
+
+   // If this is the first time we've run, get the timebase.
+   // We can use denom == 0 to indicate that sTimebaseInfo is
+   // uninitialised because it makes no sense to have a zero
+   // denominator in a fraction.
+
+   if ( sTimebaseInfo.denom == 0 ) {
+       (void) mach_timebase_info(&sTimebaseInfo);
+               start = mach_absolute_time();
+   }
+
+   elapsed = mach_absolute_time() - start;
+
+   // Convert to nanoseconds.
+       // return (elapsed * (long long)sTimebaseInfo.numer)/(long long)sTimebaseInfo.denom;
+       
+       // Provided the final result is representable in 64 bits the following maneuver will
+       // deliver that result without intermediate overflow.
+       if (sTimebaseInfo.denom == sTimebaseInfo.numer)
+               return elapsed;
+       else if (sTimebaseInfo.denom == 1)
+               return elapsed * (long long)sTimebaseInfo.numer;
+       else {
+       // Decompose elapsed = eta32 * 2^32 + eps32:
+       long long eta32 = elapsed >> 32;
+       long long eps32 = elapsed & 0x00000000ffffffffLL;
+
+       long long numer = sTimebaseInfo.numer, denom = sTimebaseInfo.denom;
+
+       // Form product of elapsed64 (decomposed) and numer:
+       long long mu64 = numer * eta32;
+       long long lambda64 = numer * eps32;
+
+       // Divide the constituents by denom:
+       long long q32 = mu64/denom;
+       long long r32 = mu64 - (q32 * denom); // mu64 % denom
+
+       return (q32 << 32) + ((r32 << 32) + lambda64)/denom;
+       }
+}
+
+#endif
+
+/*
+ * user visible globals
+ */
+
+int                            lm_argc = 0;
+char **                                lm_argv = NULL;
+
+int                            lm_opt1;
+int                            lm_optA;
+int                            lm_optB;
+int                            lm_optC = 100;
+int                            lm_optD;
+int                            lm_optE;
+int                            lm_optH;
+int                            lm_optI;
+int                            lm_optL = 0;
+int                            lm_optM = 0;
+char                           *lm_optN;
+int                            lm_optP;
+int                            lm_optS;
+int                            lm_optT;
+int                            lm_optW;
+
+int                            lm_def1 = 0;
+int                            lm_defB = 0; /* use lm_nsecs_per_op */
+int                            lm_defD = 10;
+int                            lm_defH = 0;
+char                           *lm_defN = NULL;
+int                            lm_defP = 1;
+
+int                            lm_defS = 0;
+int                            lm_defT = 1;
+
+/*
+ * default on fast platform, should be overridden by individual
+ * benchmarks if significantly wrong in either direction.
+ */
+
+int                            lm_nsecs_per_op = 5;
+
+char                           *lm_procpath;
+char                           lm_procname[STRSIZE];
+char                           lm_usage[STRSIZE];
+char                           lm_optstr[STRSIZE];
+char                           lm_header[STRSIZE];
+size_t                         lm_tsdsize = 0;
+
+
+/*
+ *  Globals we do not export to the user
+ */
+
+static barrier_t               *lm_barrier;
+static pid_t                   *pids = NULL;
+static pthread_t               *tids = NULL;
+static int                     pindex = -1;
+static void                    *tsdseg = NULL;
+static size_t                  tsdsize = 0;
+
+#ifdef USE_RDTSC
+static long long               lm_hz = 0;
+#endif
+
+
+/*
+ * Forward references
+ */
+
+static void            worker_process();
+static void            usage();
+static void            print_stats(barrier_t *);
+static void            print_histo(barrier_t *);
+static int             remove_outliers(double *, int, stats_t *);
+static long long       nsecs_overhead;
+static long long       nsecs_resolution;
+static long long       get_nsecs_overhead();
+static int             crunch_stats(double *, int, stats_t *);
+static void            compute_stats(barrier_t *);
+/*
+ * main routine; renamed in this file to allow linking with other
+ * files
+ */
+
+int
+actual_main(int argc, char *argv[])
+{
+       int                     i;
+       int                     opt;
+       extern char             *optarg;
+       char                    *tmp;
+       char                    optstr[256];
+       barrier_t               *b;
+       long long               startnsecs = getnsecs();
+
+#ifdef USE_RDTSC
+       if (getenv("LIBMICRO_HZ") == NULL) {
+               (void) printf("LIBMICRO_HZ needed but not set\n");
+               exit(1);
+       }
+       lm_hz = strtoll(getenv("LIBMICRO_HZ"), NULL, 10);
+#endif
+
+       lm_argc = argc;
+       lm_argv = argv;
+
+       /* before we do anything */
+       (void) benchmark_init();
+
+
+       nsecs_overhead = get_nsecs_overhead();
+       nsecs_resolution = get_nsecs_resolution();
+
+       /*
+        * Set defaults
+        */
+
+       lm_opt1 = lm_def1;
+       lm_optB = lm_defB;
+       lm_optD = lm_defD;
+       lm_optH = lm_defH;
+       lm_optN = lm_defN;
+       lm_optP = lm_defP;
+
+       lm_optS = lm_defS;
+       lm_optT = lm_defT;
+
+       /*
+        * squirrel away the path to the current
+        * binary in a way that works on both
+        * Linux and Solaris
+        */
+
+       if (*argv[0] == '/') {
+               lm_procpath = strdup(argv[0]);
+               *strrchr(lm_procpath, '/') = 0;
+       } else {
+               char path[1024];
+               (void) getcwd(path, 1024);
+               (void) strcat(path, "/");
+               (void) strcat(path, argv[0]);
+               *strrchr(path, '/') = 0;
+               lm_procpath = strdup(path);
+       }
+
+       /*
+        * name of binary
+        */
+
+       if ((tmp = strrchr(argv[0], '/')) == NULL)
+               (void) strcpy(lm_procname, argv[0]);
+       else
+               (void) strcpy(lm_procname, tmp + 1);
+
+       if (lm_optN == NULL) {
+               lm_optN = lm_procname;
+       }
+
+       /*
+        * Parse command line arguments
+        */
+
+       (void) sprintf(optstr, "1AB:C:D:EHI:LMN:P:RST:VW?%s", lm_optstr);
+       while ((opt = getopt(argc, argv, optstr)) != -1) {
+               switch (opt) {
+               case '1':
+                       lm_opt1 = 1;
+                       break;
+               case 'A':
+                       lm_optA = 1;
+                       break;
+               case 'B':
+                       lm_optB = sizetoint(optarg);
+                       break;
+               case 'C':
+                       lm_optC = sizetoint(optarg);
+                       break;
+               case 'D':
+                       lm_optD = sizetoint(optarg);
+                       break;
+               case 'E':
+                       lm_optE = 1;
+                       break;
+               case 'H':
+                       lm_optH = 1;
+                       break;
+               case 'I':
+                       lm_optI = sizetoint(optarg);
+                       break;
+               case 'L':
+                       lm_optL = 1;
+                       break;
+               case 'M':
+                       lm_optM = 1;
+                       break;
+               case 'N':
+                       lm_optN = optarg;
+                       break;
+               case 'P':
+                       lm_optP = sizetoint(optarg);
+                       break;
+               case 'S':
+                       lm_optS = 1;
+                       break;
+               case 'T':
+                       lm_optT = sizetoint(optarg);
+                       break;
+               case 'V':
+                       (void) printf("%s\n", LIBMICRO_VERSION);
+                       exit(0);
+                       break;
+               case 'W':
+                       lm_optW = 1;
+                       lm_optS = 1;
+                       break;
+               case '?':
+                       usage();
+                       exit(0);
+                       break;
+               default:
+                       if (benchmark_optswitch(opt, optarg) == -1) {
+                               usage();
+                               exit(0);
+                       }
+               }
+       }
+
+       /* deal with implicit and overriding options */
+       if (lm_opt1 && lm_optP > 1) {
+               lm_optP = 1;
+               (void) printf("warning: -1 overrides -P\n");
+       }
+
+       if (lm_optE) {
+               (void) fprintf(stderr, "Running:%20s", lm_optN);
+               (void) fflush(stderr);
+       }
+
+       if (lm_optB == 0) {
+               /*
+                * neither benchmark or user has specified the number
+                * of cnts/sample, so use computed value
+                */
+               if (lm_optI)
+                       lm_nsecs_per_op = lm_optI;
+#define BLOCK_TOCK_DURATION 10000 /* number of raw timer "tocks" ideally comprising a block of work */
+               lm_optB = nsecs_resolution * BLOCK_TOCK_DURATION / lm_nsecs_per_op;
+               if (lm_optB == 0)
+                       lm_optB = 1;
+       }
+
+       /*
+        * now that the options are set
+        */
+
+       if (benchmark_initrun() == -1) {
+               exit(1);
+       }
+
+       /* allocate dynamic data */
+       pids = (pid_t *)malloc(lm_optP * sizeof (pid_t));
+       if (pids == NULL) {
+               perror("malloc(pids)");
+               exit(1);
+       }
+       tids = (pthread_t *)malloc(lm_optT * sizeof (pthread_t));
+       if (tids == NULL) {
+               perror("malloc(tids)");
+               exit(1);
+       }
+
+       /* check that the case defines lm_tsdsize before proceeding */
+       if (lm_tsdsize == (size_t)-1) {
+               (void) fprintf(stderr, "error in benchmark_init: "
+                   "lm_tsdsize not set\n");
+               exit(1);
+       }
+
+       /* round up tsdsize to nearest 128 to eliminate false sharing */
+       tsdsize = ((lm_tsdsize + 127) / 128) * 128;
+
+       /* allocate sufficient TSD for each thread in each process */
+       tsdseg = (void *)mmap(NULL, lm_optT * lm_optP * tsdsize + 8192,
+           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0L);
+       if (tsdseg == NULL) {
+               perror("mmap(tsd)");
+               exit(1);
+       }
+
+       /* initialise worker synchronisation */
+       b = barrier_create(lm_optT * lm_optP, DATASIZE);
+       if (b == NULL) {
+               perror("barrier_create()");
+               exit(1);
+       }
+       lm_barrier = b;
+       b->ba_flag = 1;
+
+       /* need this here so that parent and children can call exit() */
+       (void) fflush(stdout);
+       (void) fflush(stderr);
+
+       /* when we started and when to stop */
+
+       b->ba_starttime = getnsecs();
+       b->ba_deadline = (long long) (b->ba_starttime + (lm_optD * 1000000LL));
+
+       /* do the work */
+       if (lm_opt1) {
+               /* single process, non-fork mode */
+               pindex = 0;
+               worker_process();
+       } else {
+               /* create worker processes */
+               for (i = 0; i < lm_optP; i++) {
+                       pids[i] = fork();
+
+                       switch (pids[i]) {
+                       case 0:
+                               pindex = i;
+                               worker_process();
+                               exit(0);
+                               break;
+                       case -1:
+                               perror("fork");
+                               exit(1);
+                               break;
+                       default:
+                               continue;
+                       }
+               }
+
+               /* wait for worker processes */
+               for (i = 0; i < lm_optP; i++) {
+                       if (pids[i] > 0) {
+                               (void) waitpid(pids[i], NULL, 0);
+                       }
+               }
+       }
+
+       b->ba_endtime = getnsecs();
+
+       /* compute results */
+
+       compute_stats(b);
+
+       /* print arguments benchmark was invoked with ? */
+       if (lm_optL) {
+               int l;
+               (void) printf("# %s ", argv[0]);
+               for (l = 1; l < argc; l++) {
+                       (void) printf("%s ", argv[l]);
+               }
+               (void) printf("\n");
+       }
+
+       /* print result header (unless suppressed) */
+       if (!lm_optH) {
+               (void) printf("%12s %3s %3s %12s %12s %8s %8s %s\n",
+                   "", "prc", "thr",
+                   "usecs/call",
+                   "samples", "errors", "cnt/samp", lm_header);
+       }
+
+       /* print result */
+
+       (void) printf("%-12s %3d %3d %12.5f %12d %8lld %8d %s\n",
+           lm_optN, lm_optP, lm_optT,
+           (lm_optM?b->ba_corrected.st_mean:b->ba_corrected.st_median),
+           b->ba_batches, b->ba_errors, lm_optB,
+           benchmark_result());
+
+       if (lm_optS) {
+               print_stats(b);
+       }
+
+       /* just incase something goes awry */
+       (void) fflush(stdout);
+       (void) fflush(stderr);
+
+       /* cleanup by stages */
+       (void) benchmark_finirun();
+       (void) barrier_destroy(b);
+       (void) benchmark_fini();
+
+       if (lm_optE) {
+               (void) fprintf(stderr, " for %12.5f seconds\n",
+                   (double)(getnsecs() - startnsecs) /
+                   1.e9);
+               (void) fflush(stderr);
+       }
+       return (0);
+}
+
+void *
+worker_thread(void *arg)
+{
+       result_t                r;
+       long long               last_sleep = 0;
+       long long               t;
+
+       r.re_errors = benchmark_initworker(arg);
+
+       while (lm_barrier->ba_flag) {
+               r.re_count = 0;
+               r.re_errors += benchmark_initbatch(arg);
+
+               /* sync to clock */
+
+               if (lm_optA && ((t = getnsecs()) - last_sleep) > 75000000LL) {
+                       (void) poll(0, 0, 10);
+                       last_sleep = t;
+               }
+               /* wait for it ... */
+               (void) barrier_queue(lm_barrier, NULL);
+
+               /* time the test */
+               r.re_t0 = getnsecs();
+               (void) benchmark(arg, &r);
+               r.re_t1 = getnsecs();
+
+               /* time to stop? */
+               if (r.re_t1 > lm_barrier->ba_deadline &&
+                   (!lm_optC || lm_optC < lm_barrier->ba_batches)) {
+                       lm_barrier->ba_flag = 0;
+               }
+
+               /* record results and sync */
+               (void) barrier_queue(lm_barrier, &r);
+
+               (void) benchmark_finibatch(arg);
+
+               r.re_errors = 0;
+       }
+
+       (void) benchmark_finiworker(arg);
+
+       return (0);
+}
+
+void
+worker_process()
+{
+       int                     i;
+       void                    *tsd;
+
+       for (i = 1; i < lm_optT; i++) {
+               tsd = gettsd(pindex, i);
+               if (pthread_create(&tids[i], NULL, worker_thread, tsd) != 0) {
+                       perror("pthread_create");
+                       exit(1);
+               }
+       }
+
+       tsd = gettsd(pindex, 0);
+       (void) worker_thread(tsd);
+
+       for (i = 1; i < lm_optT; i++) {
+               (void) pthread_join(tids[i], NULL);
+       }
+}
+
+void
+usage()
+{
+       (void) printf(
+           "usage: %s\n"
+           "       [-1] (single process; overrides -P > 1)\n"
+           "       [-A] (align with clock)\n"
+           "       [-B batch-size (default %d)]\n"
+           "       [-C minimum number of samples (default 0)]\n"
+           "       [-D duration in msecs (default %ds)]\n"
+           "       [-E (echo name to stderr)]\n"
+           "       [-H] (suppress headers)\n"
+           "       [-I] nsecs per op (used to compute batch size)"
+           "       [-L] (print argument line)\n"
+           "       [-M] (reports mean rather than median)\n"
+           "       [-N test-name (default '%s')]\n"
+           "       [-P processes (default %d)]\n"
+           "       [-S] (print detailed stats)\n"
+           "       [-T threads (default %d)]\n"
+           "       [-V] (print the libMicro version and exit)\n"
+           "       [-W] (flag possible benchmark problems)\n"
+           "%s\n",
+           lm_procname,
+           lm_defB, lm_defD, lm_procname, lm_defP, lm_defT,
+           lm_usage);
+}
+
+void
+print_warnings(barrier_t *b)
+{
+       int head = 0;
+       int increase;
+
+       if (b->ba_quant) {
+               if (!head++) {
+                       (void) printf("#\n# WARNINGS\n");
+               }
+               increase = (int)(floor((nsecs_resolution * 100.0) /
+                   ((double)lm_optB * b->ba_corrected.st_median * 1000.0)) +
+                   1.0);
+               (void) printf("#     Quantization error likely;"
+                   "increase batch size (-B option) %dX to avoid.\n",
+                   increase);
+       }
+
+       /*
+        * XXX should warn on median != mean by a lot
+        */
+
+       if (b->ba_errors) {
+               if (!head++) {
+                       (void) printf("#\n# WARNINGS\n");
+               }
+               (void) printf("#     Errors occured during benchmark.\n");
+       }
+}
+
+void
+print_stats(barrier_t *b)
+{
+       (void) printf("#\n");
+       (void) printf("# STATISTICS         %12s          %12s\n",
+           "usecs/call (raw)",
+           "usecs/call (outliers removed)");
+
+       if (b->ba_count == 0) {
+               (void) printf("zero samples\n");
+               return;
+       }
+
+       (void) printf("#                    min %12.5f            %12.5f\n",
+           b->ba_raw.st_min,
+           b->ba_corrected.st_min);
+
+       (void) printf("#                    max %12.5f            %12.5f\n",
+           b->ba_raw.st_max,
+           b->ba_corrected.st_max);
+       (void) printf("#                   mean %12.5f            %12.5f\n",
+           b->ba_raw.st_mean,
+           b->ba_corrected.st_mean);
+       (void) printf("#                 median %12.5f            %12.5f\n",
+           b->ba_raw.st_median,
+           b->ba_corrected.st_median);
+       (void) printf("#                 stddev %12.5f            %12.5f\n",
+           b->ba_raw.st_stddev,
+           b->ba_corrected.st_stddev);
+       (void) printf("#         standard error %12.5f            %12.5f\n",
+           b->ba_raw.st_stderr,
+           b->ba_corrected.st_stderr);
+       (void) printf("#   99%% confidence level %12.5f            %12.5f\n",
+           b->ba_raw.st_99confidence,
+           b->ba_corrected.st_99confidence);
+       (void) printf("#                   skew %12.5f            %12.5f\n",
+           b->ba_raw.st_skew,
+           b->ba_corrected.st_skew);
+       (void) printf("#               kurtosis %12.5f            %12.5f\n",
+           b->ba_raw.st_kurtosis,
+           b->ba_corrected.st_kurtosis);
+
+       (void) printf("#       time correlation %12.5f            %12.5f\n",
+           b->ba_raw.st_timecorr,
+           b->ba_corrected.st_timecorr);
+       (void) printf("#\n");
+
+       (void) printf("#           elasped time %12.5f\n", (b->ba_endtime -
+           b->ba_starttime) / 1.0e9);
+       (void) printf("#      number of samples %12d\n",   b->ba_batches);
+       (void) printf("#     number of outliers %12d\n", b->ba_outliers);
+       (void) printf("#      getnsecs overhead %12d\n", (int)nsecs_overhead);
+
+       (void) printf("#\n");
+       (void) printf("# DISTRIBUTION\n");
+
+       print_histo(b);
+
+       if (lm_optW) {
+               print_warnings(b);
+       }
+}
+
+void
+update_stats(barrier_t *b, result_t *r)
+{
+       double                  time;
+       double                  nsecs_per_call;
+
+       if (b->ba_waiters == 0) {
+               /* first thread only */
+               b->ba_t0 = r->re_t0;
+               b->ba_t1 = r->re_t1;
+               b->ba_count0 = 0;
+               b->ba_errors0 = 0;
+       } else {
+               /* all but first thread */
+               if (r->re_t0 < b->ba_t0) {
+                       b->ba_t0 = r->re_t0;
+               }
+               if (r->re_t1 > b->ba_t1) {
+                       b->ba_t1 = r->re_t1;
+               }
+       }
+
+       b->ba_count0  += r->re_count;
+       b->ba_errors0 += r->re_errors;
+
+       if (b->ba_waiters == b->ba_hwm - 1) {
+               /* last thread only */
+
+
+               time = (double)b->ba_t1 - (double)b->ba_t0 -
+                   (double)nsecs_overhead;
+
+               if (time < 100 * nsecs_resolution)
+                       b->ba_quant++;
+
+               /*
+                * normalize by procs * threads if not -U
+                */
+
+               nsecs_per_call = time / (double)b->ba_count0 *
+                   (double)(lm_optT * lm_optP);
+
+               b->ba_count  += b->ba_count0;
+               b->ba_errors += b->ba_errors0;
+
+               b->ba_data[b->ba_batches % b->ba_datasize] =
+                   nsecs_per_call;
+
+               b->ba_batches++;
+       }
+}
+
+#ifdef USE_SEMOP
+barrier_t *
+barrier_create(int hwm, int datasize)
+{
+       struct sembuf           s[1];
+       barrier_t               *b;
+
+       /*LINTED*/
+       b = (barrier_t *)mmap(NULL,
+           sizeof (barrier_t) + (datasize - 1) * sizeof (double),
+           PROT_READ | PROT_WRITE,
+           MAP_SHARED | MAP_ANON, -1, 0L);
+       if (b == (barrier_t *)MAP_FAILED) {
+               return (NULL);
+       }
+       b->ba_datasize = datasize;
+
+       b->ba_flag  = 0;
+       b->ba_hwm   = hwm;
+       b->ba_semid = semget(IPC_PRIVATE, 3, 0600);
+       if (b->ba_semid == -1) {
+               (void) munmap((void *)b, sizeof (barrier_t));
+               return (NULL);
+       }
+
+       /* [hwm - 1, 0, 0] */
+       s[0].sem_num = 0;
+       s[0].sem_op  = hwm - 1;
+       s[0].sem_flg = 0;
+       if (semop(b->ba_semid, s, 1) == -1) {
+               perror("semop(1)");
+               (void) semctl(b->ba_semid, 0, IPC_RMID);
+               (void) munmap((void *)b, sizeof (barrier_t));
+               return (NULL);
+       }
+
+       b->ba_waiters = 0;
+       b->ba_phase = 0;
+
+       b->ba_count = 0;
+       b->ba_errors = 0;
+
+       return (b);
+}
+
+int
+barrier_destroy(barrier_t *b)
+{
+       (void) semctl(b->ba_semid, 0, IPC_RMID);
+       (void) munmap((void *)b, sizeof (barrier_t));
+
+       return (0);
+}
+
+int
+barrier_queue(barrier_t *b, result_t *r)
+{
+       struct sembuf           s[2];
+
+       /*
+        * {s0(-(hwm-1))}
+        * if ! nowait {s1(-(hwm-1))}
+        *   (all other threads)
+        *   update shared stats
+        *   {s0(hwm-1), s1(1)}
+        *   {s0(1), s2(-1)}
+        * else
+        *   (last thread)
+        *   update shared stats
+        *   {s2(hwm-1)}
+        */
+
+       s[0].sem_num = 0;
+       s[0].sem_op  = -(b->ba_hwm - 1);
+       s[0].sem_flg = 0;
+       if (semop(b->ba_semid, s, 1) == -1) {
+               perror("semop(2)");
+               return (-1);
+       }
+
+       s[0].sem_num = 1;
+       s[0].sem_op  = -(b->ba_hwm - 1);
+       s[0].sem_flg = IPC_NOWAIT;
+       if (semop(b->ba_semid, s, 1) == -1) {
+               if (errno != EAGAIN) {
+                       perror("semop(3)");
+                       return (-1);
+               }
+
+               /* all but the last thread */
+
+               if (r != NULL) {
+                       update_stats(b, r);
+               }
+
+               b->ba_waiters++;
+
+               s[0].sem_num = 0;
+               s[0].sem_op  = b->ba_hwm - 1;
+               s[0].sem_flg = 0;
+               s[1].sem_num = 1;
+               s[1].sem_op  = 1;
+               s[1].sem_flg = 0;
+               if (semop(b->ba_semid, s, 2) == -1) {
+                       perror("semop(4)");
+                       return (-1);
+               }
+
+               s[0].sem_num = 0;
+               s[0].sem_op  = 1;
+               s[0].sem_flg = 0;
+               s[1].sem_num = 2;
+               s[1].sem_op  = -1;
+               s[1].sem_flg = 0;
+               if (semop(b->ba_semid, s, 2) == -1) {
+                       perror("semop(5)");
+                       return (-1);
+               }
+
+       } else {
+               /* the last thread */
+
+               if (r != NULL) {
+                       update_stats(b, r);
+               }
+
+               b->ba_waiters = 0;
+               b->ba_phase++;
+
+               s[0].sem_num = 2;
+               s[0].sem_op  = b->ba_hwm - 1;
+               s[0].sem_flg = 0;
+               if (semop(b->ba_semid, s, 1) == -1) {
+                       perror("semop(6)");
+                       return (-1);
+               }
+       }
+
+       return (0);
+}
+
+#else /* USE_SEMOP */
+
+barrier_t *
+barrier_create(int hwm, int datasize)
+{
+       pthread_mutexattr_t     attr;
+       pthread_condattr_t      cattr;
+       barrier_t               *b;
+
+       /*LINTED*/
+       b = (barrier_t *)mmap(NULL,
+           sizeof (barrier_t) + (datasize - 1) * sizeof (double),
+           PROT_READ | PROT_WRITE,
+           MAP_SHARED | MAP_ANON, -1, 0L);
+       if (b == (barrier_t *)MAP_FAILED) {
+               return (NULL);
+       }
+       b->ba_datasize = datasize;
+
+       b->ba_hwm = hwm;
+       b->ba_flag  = 0;
+
+       (void) pthread_mutexattr_init(&attr);
+       (void) pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
+
+       (void) pthread_condattr_init(&cattr);
+       (void) pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
+
+       (void) pthread_mutex_init(&b->ba_lock, &attr);
+       (void) pthread_cond_init(&b->ba_cv, &cattr);
+
+       b->ba_waiters = 0;
+       b->ba_phase = 0;
+
+       b->ba_count = 0;
+       b->ba_errors = 0;
+
+       return (b);
+}
+
+int
+barrier_destroy(barrier_t *b)
+{
+       (void) munmap((void *)b, sizeof (barrier_t));
+
+       return (0);
+}
+
+int
+barrier_queue(barrier_t *b, result_t *r)
+{
+       int                     phase;
+
+       (void) pthread_mutex_lock(&b->ba_lock);
+
+       if (r != NULL) {
+               update_stats(b, r);
+       }
+
+       phase = b->ba_phase;
+
+       b->ba_waiters++;
+       if (b->ba_hwm == b->ba_waiters) {
+               b->ba_waiters = 0;
+               b->ba_phase++;
+               (void) pthread_cond_broadcast(&b->ba_cv);
+       }
+
+       while (b->ba_phase == phase) {
+               (void) pthread_cond_wait(&b->ba_cv, &b->ba_lock);
+       }
+
+       (void) pthread_mutex_unlock(&b->ba_lock);
+       return (0);
+}
+#endif /* USE_SEMOP */
+
+int
+gettindex()
+{
+       int                     i;
+
+       if (tids == NULL) {
+               return (-1);
+       }
+
+       for (i = 1; i < lm_optT; i++) {
+               if (pthread_self() == tids[i]) {
+                       return (i);
+               }
+       }
+
+       return (0);
+}
+
+int
+getpindex()
+{
+       return (pindex);
+}
+
+void *
+gettsd(int p, int t)
+{
+       if ((p < 0) || (p >= lm_optP) || (t < 0) || (t >= lm_optT))
+               return (NULL);
+
+       return ((void *)((unsigned long)tsdseg +
+           (((p * lm_optT) + t) * tsdsize)));
+}
+
+#if defined(__APPLE__)
+int
+gettsdindex(void *arg){
+        /*
+         * gettindex() can race with pthread_create() filling in tids[].
+         * This is an alternative approach to finding the calling thread's tsd in t
+sdseg
+         */
+        return tsdsize ? ((unsigned long)arg - (unsigned long)tsdseg)/tsdsize : 0;
+}
+#endif /* __APPLE__ */
+
+#ifdef USE_GETHRTIME
+long long
+getnsecs()
+{
+       return (gethrtime());
+}
+
+long long
+getusecs()
+{
+       return (gethrtime() / 1000);
+}
+
+#elif USE_RDTSC /* USE_GETHRTIME */
+
+__inline__ long long
+rdtsc(void)
+{
+       unsigned long long x;
+       __asm__ volatile(".byte 0x0f, 0x31" : "=A" (x));
+       return (x);
+}
+
+long long
+getusecs()
+{
+       return (rdtsc() * 1000000 / lm_hz);
+}
+
+long long
+getnsecs()
+{
+       return (rdtsc() * 1000000000 / lm_hz);
+}
+
+#else /* USE_GETHRTIME */
+
+long long
+getusecs()
+{
+       struct timeval          tv;
+
+       (void) gettimeofday(&tv, NULL);
+
+       return ((long long)tv.tv_sec * 1000000LL + (long long) tv.tv_usec);
+}
+
+long long
+getnsecs()
+{
+       struct timeval          tv;
+
+       (void) gettimeofday(&tv, NULL);
+
+       return ((long long)tv.tv_sec * 1000000000LL +
+           (long long) tv.tv_usec * 1000LL);
+}
+
+#endif /* USE_GETHRTIME */
+
+int
+setfdlimit(int limit)
+{
+       struct rlimit rlimit;
+
+       if (getrlimit(RLIMIT_NOFILE, &rlimit) < 0) {
+               perror("getrlimit");
+               exit(1);
+       }
+
+       if (rlimit.rlim_cur > limit)
+               return (0); /* no worries */
+
+       rlimit.rlim_cur = limit;
+
+       if (rlimit.rlim_max < limit)
+               rlimit.rlim_max = limit;
+
+       if (setrlimit(RLIMIT_NOFILE, &rlimit) < 0) {
+               perror("setrlimit");
+               exit(3);
+       }
+
+       return (0);
+}
+
+
+#define        KILOBYTE                1024
+#define        MEGABYTE                (KILOBYTE * KILOBYTE)
+#define        GIGABYTE                (KILOBYTE * MEGABYTE)
+
+long long
+sizetoll(const char *arg)
+{
+       int                     len = strlen(arg);
+       int                     i;
+       long long               mult = 1;
+
+       if (len && isalpha(arg[len - 1])) {
+               switch (arg[len - 1]) {
+
+               case 'k':
+               case 'K':
+                       mult = KILOBYTE;
+                       break;
+               case 'm':
+               case 'M':
+                       mult = MEGABYTE;
+                       break;
+               case 'g':
+               case 'G':
+                       mult = GIGABYTE;
+                       break;
+               default:
+                       return (-1);
+               }
+
+               for (i = 0; i < len - 1; i++)
+                       if (!isdigit(arg[i]))
+                               return (-1);
+       }
+
+       return (mult * strtoll(arg, NULL, 10));
+}
+
+int
+sizetoint(const char *arg)
+{
+       int                     len = strlen(arg);
+       int                     i;
+       long long               mult = 1;
+
+       if (len && isalpha(arg[len - 1])) {
+               switch (arg[len - 1]) {
+
+               case 'k':
+               case 'K':
+                       mult = KILOBYTE;
+                       break;
+               case 'm':
+               case 'M':
+                       mult = MEGABYTE;
+                       break;
+               case 'g':
+               case 'G':
+                       mult = GIGABYTE;
+                       break;
+               default:
+                       return (-1);
+               }
+
+               for (i = 0; i < len - 1; i++)
+                       if (!isdigit(arg[i]))
+                               return (-1);
+       }
+
+       return (mult * atoi(arg));
+}
+
+static void
+print_bar(long count, long total)
+{
+       int                     i;
+
+       (void) putchar_unlocked(count ? '*' : ' ');
+       for (i = 1; i < (32 * count) / total; i++)
+               (void) putchar_unlocked('*');
+       for (; i < 32; i++)
+               (void) putchar_unlocked(' ');
+}
+
+static int
+doublecmp(const void *p1, const void *p2)
+{
+       double a = *((double *)p1);
+       double b = *((double *)p2);
+
+       if (a > b)
+               return (1);
+       if (a < b)
+               return (-1);
+       return (0);
+}
+
+static void
+print_histo(barrier_t *b)
+{
+       int                     n;
+       int                     i;
+       int                     j;
+       int                     last;
+       long long               maxcount;
+       double                  sum;
+       long long               min;
+       long long               scale;
+       double                  x;
+       long long               y;
+       long long               count;
+       int                     i95;
+       double                  p95;
+       double                  r95;
+       double                  m95;
+       histo_t                 *histo;
+
+       (void) printf("#        %12s %12s %32s %12s\n", "counts", "usecs/call",
+           "", "means");
+
+       /* calculate how much data we've captured */
+       n = b->ba_batches > b->ba_datasize ? b->ba_datasize : b->ba_batches;
+
+       /* find the 95th percentile - index, value and range */
+       qsort((void *)b->ba_data, n, sizeof (double), doublecmp);
+       min = b->ba_data[0] + 0.000001;
+       i95 = n * 95 / 100;
+       p95 = b->ba_data[i95];
+       r95 = p95 - min + 1;
+
+       /* find a suitable min and scale */
+       i = 0;
+       x = r95 / (HISTOSIZE - 1);
+       while (x >= 10.0) {
+               x /= 10.0;
+               i++;
+       }
+       y = x + 0.9999999999;
+       while (i > 0) {
+               y *= 10;
+               i--;
+       }
+       min /= y;
+       min *= y;
+       scale = y * (HISTOSIZE - 1);
+       if (scale < (HISTOSIZE - 1)) {
+               scale = (HISTOSIZE - 1);
+       }
+
+       /* create and initialise the histogram */
+       histo = malloc(HISTOSIZE * sizeof (histo_t));
+       for (i = 0; i < HISTOSIZE; i++) {
+               histo[i].sum = 0.0;
+               histo[i].count = 0;
+       }
+
+       /* populate the histogram */
+       last = 0;
+       sum = 0.0;
+       count = 0;
+       for (i = 0; i < i95; i++) {
+               j = (HISTOSIZE - 1) * (b->ba_data[i] - min) / scale;
+
+               if (j >= HISTOSIZE) {
+                       (void) printf("panic!\n");
+                       j = HISTOSIZE - 1;
+               }
+
+               histo[j].sum += b->ba_data[i];
+               histo[j].count++;
+
+               sum += b->ba_data[i];
+               count++;
+       }
+       m95 = sum / count;
+
+       /* find the larges bucket */
+       maxcount = 0;
+       for (i = 0; i < HISTOSIZE; i++)
+               if (histo[i].count > 0) {
+                       last = i;
+                       if (histo[i].count > maxcount)
+                               maxcount = histo[i].count;
+               }
+
+       /* print the buckets */
+       for (i = 0; i <= last; i++) {
+               (void) printf("#       %12lld %12.5f |", histo[i].count,
+                   (min + scale * (double)i / (HISTOSIZE - 1)));
+
+               print_bar(histo[i].count, maxcount);
+
+               if (histo[i].count > 0)
+                       (void) printf("%12.5f\n",
+                           histo[i].sum / histo[i].count);
+               else
+                       (void) printf("%12s\n", "-");
+       }
+
+       /* find the mean of values beyond the 95th percentile */
+       sum = 0.0;
+       count = 0;
+       for (i = i95; i < n; i++) {
+               sum += b->ba_data[i];
+               count++;
+       }
+
+       /* print the >95% bucket summary */
+       (void) printf("#\n");
+       (void) printf("#       %12lld %12s |", count, "> 95%");
+       print_bar(count, maxcount);
+       if (count > 0)
+               (void) printf("%12.5f\n", sum / count);
+       else
+               (void) printf("%12s\n", "-");
+       (void) printf("#\n");
+       (void) printf("#       %12s %12.5f\n", "mean of 95%", m95);
+       (void) printf("#       %12s %12.5f\n", "95th %ile", p95);
+
+       /* quantify any buffer overflow */
+       if (b->ba_batches > b->ba_datasize)
+               (void) printf("#       %12s %12d\n", "data dropped",
+                   b->ba_batches - b->ba_datasize);
+}
+
+static void
+compute_stats(barrier_t *b)
+{
+       int i;
+
+       if (b->ba_batches > b->ba_datasize)
+               b->ba_batches = b->ba_datasize;
+
+       /*
+        * convert to usecs/call
+        */
+
+       for (i = 0; i < b->ba_batches; i++)
+               b->ba_data[i] /= 1000.0;
+
+       /*
+        * do raw stats
+        */
+
+       (void) crunch_stats(b->ba_data, b->ba_batches, &b->ba_raw);
+
+       /*
+        * recursively apply 3 sigma rule to remove outliers
+        */
+
+       b->ba_corrected = b->ba_raw;
+       b->ba_outliers = 0;
+
+       if (b->ba_batches > 40) { /* remove outliers */
+               int removed;
+
+               do {
+                       removed = remove_outliers(b->ba_data, b->ba_batches,
+                           &b->ba_corrected);
+                       b->ba_outliers += removed;
+                       b->ba_batches -= removed;
+                       (void) crunch_stats(b->ba_data, b->ba_batches,
+                           &b->ba_corrected);
+                       } while (removed != 0 && b->ba_batches > 40);
+       }
+
+}
+
+/*
+ * routine to compute various statistics on array of doubles.
+ */
+
+static int
+crunch_stats(double *data, int count, stats_t *stats)
+{
+       double a;
+       double std;
+       double diff;
+       double sk;
+       double ku;
+       double mean;
+       int i;
+       int bytes;
+       double *dupdata;
+
+       /*
+        * first we need the mean
+        */
+
+       mean = 0.0;
+
+       for (i = 0; i < count; i++) {
+               mean += data[i];
+       }
+
+       mean /= count;
+
+       stats->st_mean = mean;
+
+       /*
+        * malloc and sort so we can do median
+        */
+
+       dupdata = malloc(bytes = sizeof (double) * count);
+       (void) memcpy(dupdata, data, bytes);
+       qsort((void *)dupdata, count, sizeof (double), doublecmp);
+       stats->st_median   = dupdata[count/2];
+
+       /*
+        * reuse dupdata to compute time correlation of data to
+        * detect interesting time-based trends
+        */
+
+       for (i = 0; i < count; i++)
+               dupdata[i] = (double)i;
+
+       (void) fit_line(dupdata, data, count, &a, &stats->st_timecorr);
+       free(dupdata);
+
+       std = 0.0;
+       sk  = 0.0;
+       ku  = 0.0;
+
+       stats->st_max = -1;
+       stats->st_min = 1.0e99; /* hard to find portable values */
+
+       for (i = 0; i < count; i++) {
+               if (data[i] > stats->st_max)
+                       stats->st_max = data[i];
+               if (data[i] < stats->st_min)
+                       stats->st_min = data[i];
+
+               diff = data[i] - mean;
+               std += diff * diff;
+               sk  += diff * diff * diff;
+               ku  += diff * diff * diff * diff;
+       }
+
+       stats->st_stddev   = std = sqrt(std/(double)(count - 1));
+       stats->st_stderr   = std / sqrt(count);
+       stats->st_99confidence = stats->st_stderr * 2.326;
+       stats->st_skew     = sk / (std * std * std) / (double)(count);
+       stats->st_kurtosis = ku / (std * std * std * std) /
+           (double)(count) - 3;
+
+       return (0);
+}
+
+/*
+ * does a least squares fit to the set of points x, y and
+ * fits a line y = a + bx.  Returns a, b
+ */
+
+int
+fit_line(double *x, double *y, int count, double *a, double *b)
+{
+       double sumx, sumy, sumxy, sumx2;
+       double denom;
+       int i;
+
+       sumx = sumy = sumxy = sumx2 = 0.0;
+
+       for (i = 0; i < count; i++) {
+               sumx    += x[i];
+               sumx2   += x[i] * x[i];
+               sumy    += y[i];
+               sumxy   += x[i] * y[i];
+       }
+
+       denom = count * sumx2 - sumx * sumx;
+
+       if (denom == 0.0)
+               return (-1);
+
+       *a = (sumy * sumx2 - sumx * sumxy) / denom;
+
+       *b = (count * sumxy - sumx * sumy) / denom;
+
+       return (0);
+}
+
+/*
+ * empty function for measurement purposes
+ */
+
+int
+nop()
+{
+       return (1);
+}
+
+#define        NSECITER 1000
+
+static long long
+get_nsecs_overhead()
+{
+       long long s;
+
+       double data[NSECITER];
+       stats_t stats;
+
+       int i;
+       int count;
+       int outliers;
+
+       (void) getnsecs(); /* warmup */
+       (void) getnsecs(); /* warmup */
+       (void) getnsecs(); /* warmup */
+
+       i = 0;
+
+       count = NSECITER;
+
+       for (i = 0; i < count; i++) {
+               s = getnsecs();
+               data[i] = getnsecs() - s;
+       }
+
+       (void) crunch_stats(data, count, &stats);
+
+       while ((outliers = remove_outliers(data, count, &stats)) != 0) {
+               count -= outliers;
+               (void) crunch_stats(data, count, &stats);
+       }
+
+       return ((long long)stats.st_mean);
+
+}
+
+long long
+get_nsecs_resolution()
+{
+       long long y[1000];
+
+       int i, j, nops, res;
+       long long start, stop;
+
+       /*
+        * first, figure out how many nops to use
+        * to get any delta between time measurements.
+        * use a minimum of one.
+        */
+
+       /*
+        * warm cache
+        */
+
+       stop = start = getnsecs();
+
+       for (i = 1; i < 10000000; i++) {
+               start = getnsecs();
+               for (j = i; j; j--)
+                       ;
+               stop = getnsecs();
+               if (stop > start)
+                       break;
+       }
+
+       nops = i;
+
+       /*
+        * now collect data at linearly varying intervals
+        */
+
+       for (i = 0; i < 1000; i++) {
+               start = getnsecs();
+               for (j = nops * i; j; j--)
+                       ;
+               stop = getnsecs();
+               y[i] = stop - start;
+       }
+
+       /*
+        * find smallest positive difference between samples;
+        * this is the timer resolution
+        */
+
+       res = 1<<30;
+
+       for (i = 1; i < 1000; i++) {
+               int diff = y[i] - y[i-1];
+
+               if (diff > 0 && res > diff)
+                       res = diff;
+
+       }
+
+       return (res);
+}
+
+/*
+ * remove any data points from the array more than 3 sigma out
+ */
+
+static int
+remove_outliers(double *data, int count, stats_t *stats)
+{
+       double outmin = stats->st_mean - 3 * stats->st_stddev;
+       double outmax = stats->st_mean + 3 * stats->st_stddev;
+
+       int i, j, outliers;
+
+       for (outliers = i = j = 0; i < count; i++)
+               if (data[i] > outmax || data[i] < outmin)
+                       outliers++;
+               else
+                       data[j++] = data[i];
+
+       return (outliers);
+}