4  * The contents of this file are subject to the terms 
   5  * of the Common Development and Distribution License 
   6  * (the "License").  You may not use this file except 
   7  * in compliance with the License. 
   9  * You can obtain a copy of the license at 
  10  * src/OPENSOLARIS.LICENSE 
  11  * or http://www.opensolaris.org/os/licensing. 
  12  * See the License for the specific language governing 
  13  * permissions and limitations under the License. 
  15  * When distributing Covered Code, include this CDDL 
  16  * HEADER in each file and include the License file at 
  17  * usr/src/OPENSOLARIS.LICENSE.  If applicable, 
  18  * add the following below this CDDL HEADER, with the 
  19  * fields enclosed by brackets "[]" replaced with your 
  20  * own identifying information: Portions Copyright [yyyy] 
  21  * [name of copyright owner] 
  27  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved. 
  28  * Use is subject to license terms. 
  32  * benchmarking routines 
  35 #include <sys/types.h> 
  52 #include <sys/resource.h> 
  63 #if defined(__APPLE__) 
  64 #include <mach/mach_time.h> 
  70    static long long        start
; 
  71    static mach_timebase_info_data_t    sTimebaseInfo 
= { 0, 0 }; 
  73    // If this is the first time we've run, get the timebase. 
  74    // We can use denom == 0 to indicate that sTimebaseInfo is 
  75    // uninitialised because it makes no sense to have a zero 
  76    // denominator in a fraction. 
  78    if ( sTimebaseInfo
.denom 
== 0 ) { 
  79        (void) mach_timebase_info(&sTimebaseInfo
); 
  80                 start 
= mach_absolute_time(); 
  83    elapsed 
= mach_absolute_time() - start
; 
  85    // Convert to nanoseconds. 
  86         // return (elapsed * (long long)sTimebaseInfo.numer)/(long long)sTimebaseInfo.denom; 
  88         // Provided the final result is representable in 64 bits the following maneuver will 
  89         // deliver that result without intermediate overflow. 
  90         if (sTimebaseInfo
.denom 
== sTimebaseInfo
.numer
) 
  92         else if (sTimebaseInfo
.denom 
== 1) 
  93                 return elapsed 
* (long long)sTimebaseInfo
.numer
; 
  95        // Decompose elapsed = eta32 * 2^32 + eps32: 
  96        long long eta32 
= elapsed 
>> 32; 
  97        long long eps32 
= elapsed 
& 0x00000000ffffffffLL
; 
  99        long long numer 
= sTimebaseInfo
.numer
, denom 
= sTimebaseInfo
.denom
; 
 101        // Form product of elapsed64 (decomposed) and numer: 
 102        long long mu64 
= numer 
* eta32
; 
 103        long long lambda64 
= numer 
* eps32
; 
 105        // Divide the constituents by denom: 
 106        long long q32 
= mu64
/denom
; 
 107        long long r32 
= mu64 
- (q32 
* denom
); // mu64 % denom 
 109        return (q32 
<< 32) + ((r32 
<< 32) + lambda64
)/denom
; 
 116  * user visible globals 
 120 char **                         lm_argv 
= NULL
; 
 139 int                             lm_defB 
= 0; /* use lm_nsecs_per_op */ 
 142 char                            *lm_defN 
= NULL
; 
 149  * default on fast platform, should be overridden by individual 
 150  * benchmarks if significantly wrong in either direction. 
 153 int                             lm_nsecs_per_op 
= 5; 
 156 char                            lm_procname
[STRSIZE
]; 
 157 char                            lm_usage
[STRSIZE
]; 
 158 char                            lm_optstr
[STRSIZE
]; 
 159 char                            lm_header
[STRSIZE
]; 
 160 size_t                          lm_tsdsize 
= 0; 
 164  *  Globals we do not export to the user 
 167 static barrier_t                
*lm_barrier
; 
 168 static pid_t                    
*pids 
= NULL
; 
 169 static pthread_t                
*tids 
= NULL
; 
 170 static int                      pindex 
= -1; 
 171 static void                     *tsdseg 
= NULL
; 
 172 static size_t                   tsdsize 
= 0; 
 175 static long long                lm_hz 
= 0; 
 183 static void             worker_process(); 
 185 static void             print_stats(barrier_t 
*); 
 186 static void             print_histo(barrier_t 
*); 
 187 static int              remove_outliers(double *, int, stats_t 
*); 
 188 static long long        nsecs_overhead
; 
 189 static long long        nsecs_resolution
; 
 190 static long long        get_nsecs_overhead(); 
 191 static int              crunch_stats(double *, int, stats_t 
*); 
 192 static void             compute_stats(barrier_t 
*); 
 194  * main routine; renamed in this file to allow linking with other 
 199 actual_main(int argc
, char *argv
[]) 
 207         long long               startnsecs 
= getnsecs(); 
 210         if (getenv("LIBMICRO_HZ") == NULL
) { 
 211                 (void) printf("LIBMICRO_HZ needed but not set\n"); 
 214         lm_hz 
= strtoll(getenv("LIBMICRO_HZ"), NULL
, 10); 
 220         /* before we do anything */ 
 221         (void) benchmark_init(); 
 224         nsecs_overhead 
= get_nsecs_overhead(); 
 225         nsecs_resolution 
= get_nsecs_resolution(); 
 242          * squirrel away the path to the current 
 243          * binary in a way that works on both 
 247         if (*argv
[0] == '/') { 
 248                 lm_procpath 
= strdup(argv
[0]); 
 249                 *strrchr(lm_procpath
, '/') = 0; 
 252                 (void) getcwd(path
, 1024); 
 253                 (void) strcat(path
, "/"); 
 254                 (void) strcat(path
, argv
[0]); 
 255                 *strrchr(path
, '/') = 0; 
 256                 lm_procpath 
= strdup(path
); 
 263         if ((tmp 
= strrchr(argv
[0], '/')) == NULL
) 
 264                 (void) strcpy(lm_procname
, argv
[0]); 
 266                 (void) strcpy(lm_procname
, tmp 
+ 1); 
 268         if (lm_optN 
== NULL
) { 
 269                 lm_optN 
= lm_procname
; 
 273          * Parse command line arguments 
 276         (void) sprintf(optstr
, "1AB:C:D:EHI:LMN:P:RST:VW?%s", lm_optstr
); 
 277         while ((opt 
= getopt(argc
, argv
, optstr
)) != -1) { 
 286                         lm_optB 
= sizetoint(optarg
); 
 289                         lm_optC 
= sizetoint(optarg
); 
 292                         lm_optD 
= sizetoint(optarg
); 
 301                         lm_optI 
= sizetoint(optarg
); 
 313                         lm_optP 
= sizetoint(optarg
); 
 319                         lm_optT 
= sizetoint(optarg
); 
 322                         (void) printf("%s\n", LIBMICRO_VERSION
); 
 334                         if (benchmark_optswitch(opt
, optarg
) == -1) { 
 341         /* deal with implicit and overriding options */ 
 342         if (lm_opt1 
&& lm_optP 
> 1) { 
 344                 (void) printf("warning: -1 overrides -P\n"); 
 348                 (void) fprintf(stderr
, "Running:%20s", lm_optN
); 
 349                 (void) fflush(stderr
); 
 354                  * neither benchmark nor user has specified the number 
 355                  * of cnts/sample, so use computed value 
 358                         lm_nsecs_per_op 
= lm_optI
; 
 359 #define BLOCK_TOCK_DURATION 10000 /* number of raw timer "tocks" ideally comprising a block of work */ 
 360                 lm_optB 
= nsecs_resolution 
* BLOCK_TOCK_DURATION 
/ lm_nsecs_per_op
; 
 366          * now that the options are set 
 369         if (benchmark_initrun() == -1) { 
 373         /* allocate dynamic data */ 
 374         pids 
= (pid_t 
*)malloc(lm_optP 
* sizeof (pid_t
)); 
 376                 perror("malloc(pids)"); 
 379         tids 
= (pthread_t 
*)malloc(lm_optT 
* sizeof (pthread_t
)); 
 381                 perror("malloc(tids)"); 
 385         /* check that the case defines lm_tsdsize before proceeding */ 
 386         if (lm_tsdsize 
== (size_t)-1) { 
 387                 (void) fprintf(stderr
, "error in benchmark_init: " 
 388                     "lm_tsdsize not set\n"); 
 392         /* round up tsdsize to nearest 128 to eliminate false sharing */ 
 393         tsdsize 
= ((lm_tsdsize 
+ 127) / 128) * 128; 
 395         /* allocate sufficient TSD for each thread in each process */ 
 396         tsdseg 
= (void *)mmap(NULL
, lm_optT 
* lm_optP 
* tsdsize 
+ 8192, 
 397             PROT_READ 
| PROT_WRITE
, MAP_SHARED 
| MAP_ANON
, -1, 0L); 
 398         if (tsdseg 
== NULL
) { 
 403         /* initialise worker synchronisation */ 
 404         b 
= barrier_create(lm_optT 
* lm_optP
, DATASIZE
); 
 406                 perror("barrier_create()"); 
 412         /* need this here so that parent and children can call exit() */ 
 413         (void) fflush(stdout
); 
 414         (void) fflush(stderr
); 
 416         /* when we started and when to stop */ 
 418         b
->ba_starttime 
= getnsecs(); 
 419         b
->ba_deadline 
= (long long) (b
->ba_starttime 
+ (lm_optD 
* 1000000LL)); 
 423                 /* single process, non-fork mode */ 
 427                 /* create worker processes */ 
 428                 for (i 
= 0; i 
< lm_optP
; i
++) { 
 446                 /* wait for worker processes */ 
 447                 for (i 
= 0; i 
< lm_optP
; i
++) { 
 449                                 (void) waitpid(pids
[i
], NULL
, 0); 
 454         b
->ba_endtime 
= getnsecs(); 
 456         /* compute results */ 
 460         /* print arguments benchmark was invoked with ? */ 
 463                 (void) printf("# %s ", argv
[0]); 
 464                 for (l 
= 1; l 
< argc
; l
++) { 
 465                         (void) printf("%s ", argv
[l
]); 
 470         /* print result header (unless suppressed) */ 
 472                 (void) printf("%12s %3s %3s %12s %12s %8s %8s %s\n", 
 475                     "samples", "errors", "cnt/samp", lm_header
); 
 480         (void) printf("%-12s %3d %3d %12.5f %12d %8lld %8d %s\n", 
 481             lm_optN
, lm_optP
, lm_optT
, 
 482             (lm_optM
?b
->ba_corrected
.st_mean
:b
->ba_corrected
.st_median
), 
 483             b
->ba_batches
, b
->ba_errors
, lm_optB
, 
 490         /* just in case something goes awry */ 
 491         (void) fflush(stdout
); 
 492         (void) fflush(stderr
); 
 494         /* cleanup by stages */ 
 495         (void) benchmark_finirun(); 
 496         (void) barrier_destroy(b
); 
 497         (void) benchmark_fini(); 
 500                 (void) fprintf(stderr
, " for %12.5f seconds\n", 
 501                     (double)(getnsecs() - startnsecs
) / 
 503                 (void) fflush(stderr
); 
 509 worker_thread(void *arg
) 
 512         long long               last_sleep 
= 0; 
 515         r
.re_errors 
= benchmark_initworker(arg
); 
 517         while (lm_barrier
->ba_flag
) { 
 519                 r
.re_errors 
+= benchmark_initbatch(arg
); 
 523                 if (lm_optA 
&& ((t 
= getnsecs()) - last_sleep
) > 75000000LL) { 
 524                         (void) poll(0, 0, 10); 
 527                 /* wait for it ... */ 
 528                 (void) barrier_queue(lm_barrier
, NULL
); 
 531                 r
.re_t0 
= getnsecs(); 
 532                 (void) benchmark(arg
, &r
); 
 533                 r
.re_t1 
= getnsecs(); 
 536                 if (r
.re_t1 
> lm_barrier
->ba_deadline 
&& 
 537                     (!lm_optC 
|| lm_optC 
< lm_barrier
->ba_batches
)) { 
 538                         lm_barrier
->ba_flag 
= 0; 
 541                 /* record results and sync */ 
 542                 (void) barrier_queue(lm_barrier
, &r
); 
 544                 (void) benchmark_finibatch(arg
); 
 549         (void) benchmark_finiworker(arg
); 
 560         for (i 
= 1; i 
< lm_optT
; i
++) { 
 561                 tsd 
= gettsd(pindex
, i
); 
 562                 if (pthread_create(&tids
[i
], NULL
, worker_thread
, tsd
) != 0) { 
 563                         perror("pthread_create"); 
 568         tsd 
= gettsd(pindex
, 0); 
 569         (void) worker_thread(tsd
); 
 571         for (i 
= 1; i 
< lm_optT
; i
++) { 
 572                 (void) pthread_join(tids
[i
], NULL
); 
 581             "       [-1] (single process; overrides -P > 1)\n" 
 582             "       [-A] (align with clock)\n" 
 583             "       [-B batch-size (default %d)]\n" 
 584             "       [-C minimum number of samples (default 0)]\n" 
 585             "       [-D duration in msecs (default %ds)]\n" 
 586             "       [-E (echo name to stderr)]\n" 
 587             "       [-H] (suppress headers)\n" 
 588             "       [-I] nsecs per op (used to compute batch size)" 
 589             "       [-L] (print argument line)\n" 
 590             "       [-M] (reports mean rather than median)\n" 
 591             "       [-N test-name (default '%s')]\n" 
 592             "       [-P processes (default %d)]\n" 
 593             "       [-S] (print detailed stats)\n" 
 594             "       [-T threads (default %d)]\n" 
 595             "       [-V] (print the libMicro version and exit)\n" 
 596             "       [-W] (flag possible benchmark problems)\n" 
 599             lm_defB
, lm_defD
, lm_procname
, lm_defP
, lm_defT
, 
 604 print_warnings(barrier_t 
*b
) 
 611                         (void) printf("#\n# WARNINGS\n"); 
 613                 increase 
= (int)(floor((nsecs_resolution 
* 100.0) / 
 614                     ((double)lm_optB 
* b
->ba_corrected
.st_median 
* 1000.0)) + 
 616                 (void) printf("#     Quantization error likely;" 
 617                     "increase batch size (-B option) %dX to avoid.\n", 
 622          * XXX should warn on median != mean by a lot 
 627                         (void) printf("#\n# WARNINGS\n"); 
 629                 (void) printf("#     Errors occured during benchmark.\n"); 
 634 print_stats(barrier_t 
*b
) 
 636         (void) printf("#\n"); 
 637         (void) printf("# STATISTICS         %12s          %12s\n", 
 639             "usecs/call (outliers removed)"); 
 641         if (b
->ba_count 
== 0) { 
 642                 (void) printf("zero samples\n"); 
 646         (void) printf("#                    min %12.5f            %12.5f\n", 
 648             b
->ba_corrected
.st_min
); 
 650         (void) printf("#                    max %12.5f            %12.5f\n", 
 652             b
->ba_corrected
.st_max
); 
 653         (void) printf("#                   mean %12.5f            %12.5f\n", 
 655             b
->ba_corrected
.st_mean
); 
 656         (void) printf("#                 median %12.5f            %12.5f\n", 
 658             b
->ba_corrected
.st_median
); 
 659         (void) printf("#                 stddev %12.5f            %12.5f\n", 
 661             b
->ba_corrected
.st_stddev
); 
 662         (void) printf("#         standard error %12.5f            %12.5f\n", 
 664             b
->ba_corrected
.st_stderr
); 
 665         (void) printf("#   99%% confidence level %12.5f            %12.5f\n", 
 666             b
->ba_raw
.st_99confidence
, 
 667             b
->ba_corrected
.st_99confidence
); 
 668         (void) printf("#                   skew %12.5f            %12.5f\n", 
 670             b
->ba_corrected
.st_skew
); 
 671         (void) printf("#               kurtosis %12.5f            %12.5f\n", 
 672             b
->ba_raw
.st_kurtosis
, 
 673             b
->ba_corrected
.st_kurtosis
); 
 675         (void) printf("#       time correlation %12.5f            %12.5f\n", 
 676             b
->ba_raw
.st_timecorr
, 
 677             b
->ba_corrected
.st_timecorr
); 
 678         (void) printf("#\n"); 
 680         (void) printf("#           elasped time %12.5f\n", (b
->ba_endtime 
- 
 681             b
->ba_starttime
) / 1.0e9
); 
 682         (void) printf("#      number of samples %12d\n",   b
->ba_batches
); 
 683         (void) printf("#     number of outliers %12d\n", b
->ba_outliers
); 
 684         (void) printf("#      getnsecs overhead %12d\n", (int)nsecs_overhead
); 
 686         (void) printf("#\n"); 
 687         (void) printf("# DISTRIBUTION\n"); 
 697 update_stats(barrier_t 
*b
, result_t 
*r
) 
 700         double                  nsecs_per_call
; 
 702         if (b
->ba_waiters 
== 0) { 
 703                 /* first thread only */ 
 709                 /* all but first thread */ 
 710                 if (r
->re_t0 
< b
->ba_t0
) { 
 713                 if (r
->re_t1 
> b
->ba_t1
) { 
 718         b
->ba_count0  
+= r
->re_count
; 
 719         b
->ba_errors0 
+= r
->re_errors
; 
 721         if (b
->ba_waiters 
== b
->ba_hwm 
- 1) { 
 722                 /* last thread only */ 
 725                 time 
= (double)b
->ba_t1 
- (double)b
->ba_t0 
- 
 726                     (double)nsecs_overhead
; 
 728                 if (time 
< 100 * nsecs_resolution
) 
 732                  * normalize by procs * threads if not -U 
 735                 nsecs_per_call 
= time 
/ (double)b
->ba_count0 
* 
 736                     (double)(lm_optT 
* lm_optP
); 
 738                 b
->ba_count  
+= b
->ba_count0
; 
 739                 b
->ba_errors 
+= b
->ba_errors0
; 
 741                 b
->ba_data
[b
->ba_batches 
% b
->ba_datasize
] = 
 750 barrier_create(int hwm
, int datasize
) 
 756         b 
= (barrier_t 
*)mmap(NULL
, 
 757             sizeof (barrier_t
) + (datasize 
- 1) * sizeof (double), 
 758             PROT_READ 
| PROT_WRITE
, 
 759             MAP_SHARED 
| MAP_ANON
, -1, 0L); 
 760         if (b 
== (barrier_t 
*)MAP_FAILED
) { 
 763         b
->ba_datasize 
= datasize
; 
 767         b
->ba_semid 
= semget(IPC_PRIVATE
, 3, 0600); 
 768         if (b
->ba_semid 
== -1) { 
 769                 (void) munmap((void *)b
, sizeof (barrier_t
)); 
 773         /* [hwm - 1, 0, 0] */ 
 775         s
[0].sem_op  
= hwm 
- 1; 
 777         if (semop(b
->ba_semid
, s
, 1) == -1) { 
 779                 (void) semctl(b
->ba_semid
, 0, IPC_RMID
); 
 780                 (void) munmap((void *)b
, sizeof (barrier_t
)); 
 794 barrier_destroy(barrier_t 
*b
) 
 796         (void) semctl(b
->ba_semid
, 0, IPC_RMID
); 
 797         (void) munmap((void *)b
, sizeof (barrier_t
)); 
 803 barrier_queue(barrier_t 
*b
, result_t 
*r
) 
 809          * if ! nowait {s1(-(hwm-1))} 
 810          *   (all other threads) 
 811          *   update shared stats 
 816          *   update shared stats 
 821         s
[0].sem_op  
= -(b
->ba_hwm 
- 1); 
 823         if (semop(b
->ba_semid
, s
, 1) == -1) { 
 829         s
[0].sem_op  
= -(b
->ba_hwm 
- 1); 
 830         s
[0].sem_flg 
= IPC_NOWAIT
; 
 831         if (semop(b
->ba_semid
, s
, 1) == -1) { 
 832                 if (errno 
!= EAGAIN
) { 
 837                 /* all but the last thread */ 
 846                 s
[0].sem_op  
= b
->ba_hwm 
- 1; 
 851                 if (semop(b
->ba_semid
, s
, 2) == -1) { 
 862                 if (semop(b
->ba_semid
, s
, 2) == -1) { 
 868                 /* the last thread */ 
 878                 s
[0].sem_op  
= b
->ba_hwm 
- 1; 
 880                 if (semop(b
->ba_semid
, s
, 1) == -1) { 
 889 #else /* USE_SEMOP */ 
 892 barrier_create(int hwm
, int datasize
) 
 894         pthread_mutexattr_t     attr
; 
 895         pthread_condattr_t      cattr
; 
 899         b 
= (barrier_t 
*)mmap(NULL
, 
 900             sizeof (barrier_t
) + (datasize 
- 1) * sizeof (double), 
 901             PROT_READ 
| PROT_WRITE
, 
 902             MAP_SHARED 
| MAP_ANON
, -1, 0L); 
 903         if (b 
== (barrier_t 
*)MAP_FAILED
) { 
 906         b
->ba_datasize 
= datasize
; 
 911         (void) pthread_mutexattr_init(&attr
); 
 912         (void) pthread_mutexattr_setpshared(&attr
, PTHREAD_PROCESS_SHARED
); 
 914         (void) pthread_condattr_init(&cattr
); 
 915         (void) pthread_condattr_setpshared(&cattr
, PTHREAD_PROCESS_SHARED
); 
 917         (void) pthread_mutex_init(&b
->ba_lock
, &attr
); 
 918         (void) pthread_cond_init(&b
->ba_cv
, &cattr
); 
 930 barrier_destroy(barrier_t 
*b
) 
 932         (void) munmap((void *)b
, sizeof (barrier_t
)); 
 938 barrier_queue(barrier_t 
*b
, result_t 
*r
) 
 942         (void) pthread_mutex_lock(&b
->ba_lock
); 
 951         if (b
->ba_hwm 
== b
->ba_waiters
) { 
 954                 (void) pthread_cond_broadcast(&b
->ba_cv
); 
 957         while (b
->ba_phase 
== phase
) { 
 958                 (void) pthread_cond_wait(&b
->ba_cv
, &b
->ba_lock
); 
 961         (void) pthread_mutex_unlock(&b
->ba_lock
); 
 964 #endif /* USE_SEMOP */ 
 975         for (i 
= 1; i 
< lm_optT
; i
++) { 
 976                 if (pthread_self() == tids
[i
]) { 
 993         if ((p 
< 0) || (p 
>= lm_optP
) || (t 
< 0) || (t 
>= lm_optT
)) 
 996         return ((void *)((unsigned long)tsdseg 
+ 
 997             (((p 
* lm_optT
) + t
) * tsdsize
))); 
1000 #if defined(__APPLE__) 
1002 gettsdindex(void *arg
){ 
1004          * gettindex() can race with pthread_create() filling in tids[]. 
1005          * This is an alternative approach to finding the calling thread's tsd in t 
1008         return tsdsize 
? ((unsigned long)arg 
- (unsigned long)tsdseg
)/tsdsize 
: 0; 
1010 #endif /* __APPLE__ */ 
1012 #ifdef USE_GETHRTIME 
1016         return (gethrtime()); 
1022         return (gethrtime() / 1000); 
1025 #elif USE_RDTSC /* USE_GETHRTIME */ 
1027 __inline__ 
long long 
1030         unsigned long long x
; 
1031         __asm__ 
volatile(".byte 0x0f, 0x31" : "=A" (x
)); 
1038         return (rdtsc() * 1000000 / lm_hz
); 
1044         return (rdtsc() * 1000000000 / lm_hz
); 
1047 #else /* USE_GETHRTIME */ 
1054         (void) gettimeofday(&tv
, NULL
); 
1056         return ((long long)tv
.tv_sec 
* 1000000LL + (long long) tv
.tv_usec
); 
1064         (void) gettimeofday(&tv
, NULL
); 
1066         return ((long long)tv
.tv_sec 
* 1000000000LL + 
1067             (long long) tv
.tv_usec 
* 1000LL); 
1070 #endif /* USE_GETHRTIME */ 
1073 setfdlimit(int limit
) 
1075         struct rlimit rlimit
; 
1077         if (getrlimit(RLIMIT_NOFILE
, &rlimit
) < 0) { 
1078                 perror("getrlimit"); 
1082         if (rlimit
.rlim_cur 
> limit
) 
1083                 return (0); /* no worries */ 
1085         rlimit
.rlim_cur 
= limit
; 
1087         if (rlimit
.rlim_max 
< limit
) 
1088                 rlimit
.rlim_max 
= limit
; 
1090         if (setrlimit(RLIMIT_NOFILE
, &rlimit
) < 0) { 
1091                 perror("setrlimit"); 
1099 #define KILOBYTE                1024 
1100 #define MEGABYTE                (KILOBYTE * KILOBYTE) 
1101 #define GIGABYTE                (KILOBYTE * MEGABYTE) 
1104 sizetoll(const char *arg
) 
1106         int                     len 
= strlen(arg
); 
1110         if (len 
&& isalpha(arg
[len 
- 1])) { 
1111                 switch (arg
[len 
- 1]) { 
1129                 for (i 
= 0; i 
< len 
- 1; i
++) 
1130                         if (!isdigit(arg
[i
])) 
1134         return (mult 
* strtoll(arg
, NULL
, 10)); 
1138 sizetoint(const char *arg
) 
1140         int                     len 
= strlen(arg
); 
1144         if (len 
&& isalpha(arg
[len 
- 1])) { 
1145                 switch (arg
[len 
- 1]) { 
1163                 for (i 
= 0; i 
< len 
- 1; i
++) 
1164                         if (!isdigit(arg
[i
])) 
1168         return (mult 
* atoi(arg
)); 
1172 print_bar(long count
, long total
) 
1176         (void) putchar_unlocked(count 
? '*' : ' '); 
1177         for (i 
= 1; i 
< (32 * count
) / total
; i
++) 
1178                 (void) putchar_unlocked('*'); 
1180                 (void) putchar_unlocked(' '); 
1184 doublecmp(const void *p1
, const void *p2
) 
1186         double a 
= *((double *)p1
); 
1187         double b 
= *((double *)p2
); 
1197 print_histo(barrier_t 
*b
) 
1216         (void) printf("#        %12s %12s %32s %12s\n", "counts", "usecs/call", 
1219         /* calculate how much data we've captured */ 
1220         n 
= b
->ba_batches 
> b
->ba_datasize 
? b
->ba_datasize 
: b
->ba_batches
; 
1222         /* find the 95th percentile - index, value and range */ 
1223         qsort((void *)b
->ba_data
, n
, sizeof (double), doublecmp
); 
1224         min 
= b
->ba_data
[0] + 0.000001; 
1226         p95 
= b
->ba_data
[i95
]; 
1227         r95 
= p95 
- min 
+ 1; 
1229         /* find a suitable min and scale */ 
1231         x 
= r95 
/ (HISTOSIZE 
- 1); 
1236         y 
= x 
+ 0.9999999999; 
1243         scale 
= y 
* (HISTOSIZE 
- 1); 
1244         if (scale 
< (HISTOSIZE 
- 1)) { 
1245                 scale 
= (HISTOSIZE 
- 1); 
1248         /* create and initialise the histogram */ 
1249         histo 
= malloc(HISTOSIZE 
* sizeof (histo_t
)); 
1250         for (i 
= 0; i 
< HISTOSIZE
; i
++) { 
1255         /* populate the histogram */ 
1259         for (i 
= 0; i 
< i95
; i
++) { 
1260                 j 
= (HISTOSIZE 
- 1) * (b
->ba_data
[i
] - min
) / scale
; 
1262                 if (j 
>= HISTOSIZE
) { 
1263                         (void) printf("panic!\n"); 
1267                 histo
[j
].sum 
+= b
->ba_data
[i
]; 
1270                 sum 
+= b
->ba_data
[i
]; 
1275         /* find the largest bucket */ 
1277         for (i 
= 0; i 
< HISTOSIZE
; i
++) 
1278                 if (histo
[i
].count 
> 0) { 
1280                         if (histo
[i
].count 
> maxcount
) 
1281                                 maxcount 
= histo
[i
].count
; 
1284         /* print the buckets */ 
1285         for (i 
= 0; i 
<= last
; i
++) { 
1286                 (void) printf("#       %12lld %12.5f |", histo
[i
].count
, 
1287                     (min 
+ scale 
* (double)i 
/ (HISTOSIZE 
- 1))); 
1289                 print_bar(histo
[i
].count
, maxcount
); 
1291                 if (histo
[i
].count 
> 0) 
1292                         (void) printf("%12.5f\n", 
1293                             histo
[i
].sum 
/ histo
[i
].count
); 
1295                         (void) printf("%12s\n", "-"); 
1298         /* find the mean of values beyond the 95th percentile */ 
1301         for (i 
= i95
; i 
< n
; i
++) { 
1302                 sum 
+= b
->ba_data
[i
]; 
1306         /* print the >95% bucket summary */ 
1307         (void) printf("#\n"); 
1308         (void) printf("#       %12lld %12s |", count
, "> 95%"); 
1309         print_bar(count
, maxcount
); 
1311                 (void) printf("%12.5f\n", sum 
/ count
); 
1313                 (void) printf("%12s\n", "-"); 
1314         (void) printf("#\n"); 
1315         (void) printf("#       %12s %12.5f\n", "mean of 95%", m95
); 
1316         (void) printf("#       %12s %12.5f\n", "95th %ile", p95
); 
1318         /* quantify any buffer overflow */ 
1319         if (b
->ba_batches 
> b
->ba_datasize
) 
1320                 (void) printf("#       %12s %12d\n", "data dropped", 
1321                     b
->ba_batches 
- b
->ba_datasize
); 
1325 compute_stats(barrier_t 
*b
) 
1329         if (b
->ba_batches 
> b
->ba_datasize
) 
1330                 b
->ba_batches 
= b
->ba_datasize
; 
1333          * convert to usecs/call 
1336         for (i 
= 0; i 
< b
->ba_batches
; i
++) 
1337                 b
->ba_data
[i
] /= 1000.0; 
1343         (void) crunch_stats(b
->ba_data
, b
->ba_batches
, &b
->ba_raw
); 
1346          * recursively apply 3 sigma rule to remove outliers 
1349         b
->ba_corrected 
= b
->ba_raw
; 
1352         if (b
->ba_batches 
> 40) { /* remove outliers */ 
1356                         removed 
= remove_outliers(b
->ba_data
, b
->ba_batches
, 
1358                         b
->ba_outliers 
+= removed
; 
1359                         b
->ba_batches 
-= removed
; 
1360                         (void) crunch_stats(b
->ba_data
, b
->ba_batches
, 
1362                         } while (removed 
!= 0 && b
->ba_batches 
> 40); 
1368  * routine to compute various statistics on array of doubles. 
1372 crunch_stats(double *data
, int count
, stats_t 
*stats
) 
1385          * first we need the mean 
1390         for (i 
= 0; i 
< count
; i
++) { 
1396         stats
->st_mean 
= mean
; 
1399          * malloc and sort so we can do median 
1402         dupdata 
= malloc(bytes 
= sizeof (double) * count
); 
1403         (void) memcpy(dupdata
, data
, bytes
); 
1404         qsort((void *)dupdata
, count
, sizeof (double), doublecmp
); 
1405         stats
->st_median   
= dupdata
[count
/2]; 
1408          * reuse dupdata to compute time correlation of data to 
1409          * detect interesting time-based trends 
1412         for (i 
= 0; i 
< count
; i
++) 
1413                 dupdata
[i
] = (double)i
; 
1415         (void) fit_line(dupdata
, data
, count
, &a
, &stats
->st_timecorr
); 
1423         stats
->st_min 
= 1.0e99
; /* hard to find portable values */ 
1425         for (i 
= 0; i 
< count
; i
++) { 
1426                 if (data
[i
] > stats
->st_max
) 
1427                         stats
->st_max 
= data
[i
]; 
1428                 if (data
[i
] < stats
->st_min
) 
1429                         stats
->st_min 
= data
[i
]; 
1431                 diff 
= data
[i
] - mean
; 
1433                 sk  
+= diff 
* diff 
* diff
; 
1434                 ku  
+= diff 
* diff 
* diff 
* diff
; 
1437         stats
->st_stddev   
= std 
= sqrt(std
/(double)(count 
- 1)); 
1438         stats
->st_stderr   
= std 
/ sqrt(count
); 
1439         stats
->st_99confidence 
= stats
->st_stderr 
* 2.326; 
1440         stats
->st_skew     
= sk 
/ (std 
* std 
* std
) / (double)(count
); 
1441         stats
->st_kurtosis 
= ku 
/ (std 
* std 
* std 
* std
) / 
1442             (double)(count
) - 3; 
1448  * does a least squares fit to the set of points x, y and 
1449  * fits a line y = a + bx.  Returns a, b 
1453 fit_line(double *x
, double *y
, int count
, double *a
, double *b
) 
1455         double sumx
, sumy
, sumxy
, sumx2
; 
1459         sumx 
= sumy 
= sumxy 
= sumx2 
= 0.0; 
1461         for (i 
= 0; i 
< count
; i
++) { 
1463                 sumx2   
+= x
[i
] * x
[i
]; 
1465                 sumxy   
+= x
[i
] * y
[i
]; 
1468         denom 
= count 
* sumx2 
- sumx 
* sumx
; 
1473         *a 
= (sumy 
* sumx2 
- sumx 
* sumxy
) / denom
; 
1475         *b 
= (count 
* sumxy 
- sumx 
* sumy
) / denom
; 
1481  * empty function for measurement purposes 
1490 #define NSECITER 1000 
1493 get_nsecs_overhead() 
1497         double data
[NSECITER
]; 
1504         (void) getnsecs(); /* warmup */ 
1505         (void) getnsecs(); /* warmup */ 
1506         (void) getnsecs(); /* warmup */ 
1512         for (i 
= 0; i 
< count
; i
++) { 
1514                 data
[i
] = getnsecs() - s
; 
1517         (void) crunch_stats(data
, count
, &stats
); 
1519         while ((outliers 
= remove_outliers(data
, count
, &stats
)) != 0) { 
1521                 (void) crunch_stats(data
, count
, &stats
); 
1524         return ((long long)stats
.st_mean
); 
1529 get_nsecs_resolution() 
1533         int i
, j
, nops
, res
; 
1534         long long start
, stop
; 
1537          * first, figure out how many nops to use 
1538          * to get any delta between time measurements. 
1539          * use a minimum of one. 
1546         stop 
= start 
= getnsecs(); 
1548         for (i 
= 1; i 
< 10000000; i
++) { 
1560          * now collect data at linearly varying intervals 
1563         for (i 
= 0; i 
< 1000; i
++) { 
1565                 for (j 
= nops 
* i
; j
; j
--) 
1568                 y
[i
] = stop 
- start
; 
1572          * find smallest positive difference between samples; 
1573          * this is the timer resolution 
1578         for (i 
= 1; i 
< 1000; i
++) { 
1579                 int diff 
= y
[i
] - y
[i
-1]; 
1581                 if (diff 
> 0 && res 
> diff
) 
1590  * remove any data points from the array more than 3 sigma out 
1594 remove_outliers(double *data
, int count
, stats_t 
*stats
) 
1596         double outmin 
= stats
->st_mean 
- 3 * stats
->st_stddev
; 
1597         double outmax 
= stats
->st_mean 
+ 3 * stats
->st_stddev
; 
1601         for (outliers 
= i 
= j 
= 0; i 
< count
; i
++) 
1602                 if (data
[i
] > outmax 
|| data
[i
] < outmin
) 
1605                         data
[j
++] = data
[i
];