/*
 * Benchmark VM fault throughput.
 * This test faults memory for a configurable amount of time across a
 * configurable number of threads. Currently it only measures zero fill faults.
 * Currently it supports two variants:
 * 1. Each thread gets its own vm objects to fault in
 * 2. Threads share vm objects
 *
 * We'll add more fault types as we identify problematic user-facing workloads
 * in macro benchmarks.
 *
 * Throughput is reported as pages / second using both wall time and cpu time.
 * CPU time is a more reliable metric for regression testing, but wall time can
 * highlight blocking in the VM.
 *
 * Running this benchmark directly is not recommended.
 * Use fault_throughput.lua, which provides a nicer interface and outputs
 * perfdata.
 */
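/*
 * Illustrative invocation (hypothetical binary name and values), following the
 * usage string in print_help() below:
 *
 *     ./fault_throughput separate-objects 10 4
 *
 * i.e. <test-variant> [-v] duration num_threads: run the separate-objects
 * variant for 10 seconds across 4 threads.
 */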
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <time.h>

#include <sys/mman.h>
#include <sys/types.h>
#include <sys/sysctl.h>

/*
 * TODO: Make this benchmark runnable on linux so we can do a perf comparison.
 * We're mostly using POSIX APIs, but we'll need to replace
 * the sysctls with the /proc equivalents, and replace clock_gettime_nsec_np
 * with the linux equivalent.
 */
#include <mach/mach.h>

#include <TargetConditionals.h>

#include <stdatomic.h>

#include "benchmark/helpers.h"
#if (TARGET_OS_OSX || TARGET_OS_SIMULATOR)
/*
 * On non-embedded platforms we coalesce vm objects up to 128 MB, so
 * we make the objects 128 MB on that platform to ensure they're not
 * merged with anything else.
 */
const static size_t kVmObjectSize = 128 * (1UL << 20);
#else
/*
 * Embedded platforms don't coalesce vm objects. This number
 * needs to be big enough that faulting it in dwarfs the cost of dequeuing
 * it from the work queue, but can't be too large or else we won't be able
 * to allocate one per thread in the separate-objects benchmark.
 */
const static size_t kVmObjectSize = 4 * (1UL << 20);
#endif /* (TARGET_OS_OSX || TARGET_OS_SIMULATOR) */
static const clockid_t kWallTimeClock = CLOCK_MONOTONIC_RAW;
static const clockid_t kThreadCPUTimeClock = CLOCK_THREAD_CPUTIME_ID;

/* These globals are set dynamically during test setup based on sysctls. */
static uint64_t kCacheLineSize = 0;
/* The VM page size */
static size_t kPageSize = 0;
typedef struct fault_buffer {
	unsigned char *fb_start; /* The start of this buffer. */
	size_t fb_size; /* The size of this buffer in bytes. */
} fault_buffer_t;

typedef enum test_variant {
	VARIANT_SEPARATE_VM_OBJECTS,
	VARIANT_SHARE_VM_OBJECTS
} test_variant_t;
typedef struct test_globals {
	/* This lock protects: tg_cv, tg_running_count, tg_done, tg_current_iteration, and tg_iterations_completed. */
	pthread_mutex_t tg_lock;
	pthread_cond_t tg_cv;
	/* The number of currently running threads */
	unsigned int tg_running_count;
	/* Set during cleanup to indicate that the benchmark is over. */
	bool tg_done;
	size_t tg_current_iteration;
	size_t tg_iterations_completed;
	unsigned int tg_num_threads;
	test_variant_t tg_variant;
	/*
	 * An array of memory objects to fault in.
	 * This is basically a workqueue of
	 * contiguous chunks of memory that the worker threads
	 * fault in.
	 */
	fault_buffer_t *tg_fault_buffer_arr;
	size_t tg_fault_buffer_arr_length;
	/*
	 * To avoid false sharing, we pad the test globals with an extra cache line and place the atomic
	 * next_fault_buffer_index size_t after the cache line.
	 */
	__unused char padding[];
	/*
	 * This field is directly after the padding buffer.
	 * It is used to synchronize access to tg_fault_buffer_arr.
	 */
	//_Atomic size_t tg_next_fault_buffer_index;
} test_globals_t;
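/*
 * Note: the atomic work-queue index described above is not a real struct member;
 * allocate_test_globals() over-allocates by a cache line plus sizeof(_Atomic size_t),
 * and next_fault_buffer_index_ptr() returns a pointer to that trailing slot.
 */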
static const char *kSeparateObjectsArgument = "separate-objects";
static const char *kShareObjectsArgument = "share-objects";

/* Arguments parsed from the command line */
typedef struct test_args {
	unsigned int n_threads;
	uint64_t duration_seconds;
	test_variant_t variant;
	bool verbose;
} test_args_t;
/*
 * Fault in the pages in the given buffer.
 */
static void fault_pages(fault_buffer_t *buffer, size_t stride);
/* Get a unique fault buffer from the global work queue. */
static fault_buffer_t *get_fault_buffer(test_globals_t *globals);
/*
 * Grabs buffers from the global test structure and faults them in, using this
 * test variant's stride, until there are no more buffers to grab.
 * Returns the number of microseconds spent on-cpu.
 */
static uint64_t grab_and_fault_pages(test_globals_t *globals);

static bool worker_thread_iteration_setup(size_t current_iteration, test_globals_t *globals);
static void worker_thread_iteration_complete(test_globals_t *globals);

static void parse_arguments(int argc, char **argv, test_args_t *args);
/*
 * Sets up the test globals and spawns the background threads to do the faults.
 * Returns an array of size `num_threads`
 * containing the thread ids of the forked threads.
 */
static pthread_t *setup_test(test_globals_t *globals, const test_args_t *args, size_t memory_size, bool verbose);
static test_globals_t *allocate_test_globals(void);
/* Initializes variables in the globals array. */
static void init_globals(test_globals_t *globals, const test_args_t *args);
static inline _Atomic size_t *next_fault_buffer_index_ptr(test_globals_t *globals);
/*
 * Called on the main thread.
 * Waits for the background threads to be ready, sets up the memory objects,
 * and then starts a faulting iteration.
 * Returns the start (wall) time.
 */
static uint64_t start_iteration(test_globals_t *globals, test_variant_t variant, bool verbose);
/*
 * Called on the main thread.
 * Waits for the background threads to complete the iteration and cleans up.
 * Returns the total amount of time spent faulting pages in nanoseconds by all threads thus far.
 */
static uint64_t finish_iteration(test_globals_t *globals, uint64_t start_time);
/*
 * Called on the main thread.
 * Maps buffers and places them in the work queue.
 */
static void setup_memory(test_globals_t *globals, test_variant_t variant);
/*
 * Dump test results as a csv to stdout.
 * Use fault_throughput.lua to convert to perfdata.
 */
static void output_results(const test_globals_t *globals, double walltime_elapsed_seconds, double cputime_elapsed_seconds);
static void cleanup_test(test_globals_t *globals);
/*
 * Join the background threads and return the total microseconds
 * of cpu time spent faulting across all of the threads.
 * Takes ownership of the threads array and frees it.
 */
static uint64_t join_background_threads(test_globals_t *globals, pthread_t *threads);
static void unmap_fault_buffers(test_globals_t *globals);
/*
 * Get the stride between each vm object in the fault buffer array.
 */
static size_t fault_buffer_stride(const test_globals_t *globals);
int
main(int argc, char **argv)
{
	/* How much memory should the test consume (per-core on the system)? */
#if (TARGET_OS_OSX || TARGET_OS_SIMULATOR)
	static const size_t memory_per_core = kVmObjectSize;
#else
	static const size_t memory_per_core = 25 * (1UL << 20);
#endif /* (TARGET_OS_OSX || TARGET_OS_SIMULATOR) */
	const size_t kMemSize = memory_per_core * (size_t) get_ncpu();
	test_globals_t *globals = allocate_test_globals();
	/* Total wall-time spent faulting in pages. */
	uint64_t wall_time_elapsed_ns = 0;
	/* Total cpu-time spent faulting in pages */
	uint64_t cpu_time_faulting_us = 0;
	uint64_t start_time_ns;
	test_args_t args;
	parse_arguments(argc, argv, &args);
	pthread_t *threads = setup_test(globals, &args, kMemSize, args.verbose);

	/* Keep doing more iterations until we've hit our (wall) time budget */
	while (wall_time_elapsed_ns < args.duration_seconds * kNumNanosecondsInSecond) {
		benchmark_log(args.verbose, "----Starting Iteration %lu-----\n", globals->tg_current_iteration + 1);
		start_time_ns = start_iteration(globals, args.variant, args.verbose);
		wall_time_elapsed_ns += finish_iteration(globals, start_time_ns);
		benchmark_log(args.verbose, "----Completed Iteration %lu----\n", globals->tg_current_iteration);
	}

	benchmark_log(args.verbose, "Hit time budget\nJoining worker threads\n");
	cpu_time_faulting_us = join_background_threads(globals, threads);
	benchmark_log(args.verbose, "----End Test Output----\n");
	output_results(globals, (double) wall_time_elapsed_ns / kNumNanosecondsInSecond,
	    (double) cpu_time_faulting_us / kNumMicrosecondsInSecond);
	cleanup_test(globals);
	return 0;
}
/* The main loop for the worker threads. */
static void *
faulting_thread(void *arg)
{
	test_globals_t *globals = arg;
	uint64_t on_cpu_time_faulting = 0;
	size_t current_iteration = 1;
	while (true) {
		bool should_continue = worker_thread_iteration_setup(current_iteration, globals);
		if (!should_continue) {
			break;
		}
		on_cpu_time_faulting += grab_and_fault_pages(globals);
		worker_thread_iteration_complete(globals);
		current_iteration++;
	}
	/* The accumulated on-cpu time is harvested via pthread_join() in join_background_threads(). */
	return (void *)on_cpu_time_faulting;
}
/*
 * Called on the worker threads before each iteration to synchronize this
 * iteration start with the other threads.
 * Returns true if the iteration should continue, and false if the test is over.
 */
static bool
worker_thread_iteration_setup(size_t current_iteration, test_globals_t *globals)
{
	bool should_continue = false;
	int ret = 0;
	// Gate on the other threads being ready to start
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	globals->tg_running_count++;
	if (globals->tg_running_count == globals->tg_num_threads) {
		// All the worker threads are running.
		// Wake up the main thread so that it can ungate the test.
		ret = pthread_cond_broadcast(&globals->tg_cv);
		assert(ret == 0);
	}
	/*
	 * The main thread will start this iteration by incrementing
	 * tg_current_iteration. Block until that happens.
	 * See start_iteration for the wakeup code.
	 */
	while (!globals->tg_done && globals->tg_current_iteration != current_iteration) {
		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		assert(ret == 0);
	}

	should_continue = !globals->tg_done;
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
	return should_continue;
}
/*
 * Called on the worker threads before each iteration finishes to synchronize
 * with the other threads.
 */
static void
worker_thread_iteration_complete(test_globals_t *globals)
{
	int ret;
	// Mark ourselves as done and wait for the other threads to finish
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	globals->tg_running_count--;
	if (globals->tg_running_count == 0) {
		// We're the last one to finish. Mark this iteration as completed and wake everyone up.
		globals->tg_iterations_completed++;
		ret = pthread_cond_broadcast(&globals->tg_cv);
		assert(ret == 0);
	} else {
		// Others are running. Wait for them to finish.
		while (globals->tg_iterations_completed != globals->tg_current_iteration) {
			ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
			assert(ret == 0);
		}
	}

	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
}
static void
fault_pages(fault_buffer_t *buffer, size_t stride)
{
	volatile unsigned char val;
	for (unsigned char *ptr = buffer->fb_start; ptr < buffer->fb_start + buffer->fb_size; ptr += stride) {
		/* Touch one byte per stride; reading a never-written anonymous page triggers a zero-fill fault. */
		val = *ptr;
	}
}
static fault_buffer_t *
get_fault_buffer(test_globals_t *globals)
{
	size_t index = atomic_fetch_add_explicit(next_fault_buffer_index_ptr(globals), 1UL, memory_order_acq_rel);
	if (index < globals->tg_fault_buffer_arr_length) {
		return &globals->tg_fault_buffer_arr[index];
	}
	return NULL;
}
static uint64_t
grab_and_fault_pages(test_globals_t *globals)
{
	struct timespec start_time, end_time;
	/* NB: despite the name, this accumulates microseconds (see timespec_difference_us below). */
	uint64_t nanoseconds_faulting_on_cpu = 0;
	int ret;
	size_t stride = fault_buffer_stride(globals) * kPageSize;
	while (true) {
		fault_buffer_t *object = get_fault_buffer(globals);
		if (object == NULL) {
			break;
		}
		/* Only the faulting work is timed, so the cost of dequeuing buffers is excluded. */
		ret = clock_gettime(kThreadCPUTimeClock, &start_time);
		assert(ret == 0);

		fault_pages(object, stride);

		ret = clock_gettime(kThreadCPUTimeClock, &end_time);
		assert(ret == 0);
		nanoseconds_faulting_on_cpu += (unsigned long) timespec_difference_us(&end_time, &start_time);
	}
	return nanoseconds_faulting_on_cpu;
}
static uint64_t
start_iteration(test_globals_t *globals, test_variant_t variant, bool verbose)
{
	int ret;
	uint64_t start_time;
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	benchmark_log(verbose, "Waiting for workers to catch up before starting next iteration.\n");
	/* Wait until all the threads are ready to go to the next iteration */
	while (globals->tg_running_count != globals->tg_num_threads) {
		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		assert(ret == 0);
	}
	benchmark_log(verbose, "Workers are all caught up\n");
	setup_memory(globals, variant);
	benchmark_log(verbose, "Initialized data structures for iteration. Waking workers.\n");
	/* Grab a timestamp, tick the current iteration, and wake up the worker threads */
	start_time = current_timestamp_ns();
	globals->tg_current_iteration++;
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
	ret = pthread_cond_broadcast(&globals->tg_cv);
	assert(ret == 0);
	return start_time;
}
static uint64_t
finish_iteration(test_globals_t *globals, uint64_t start_time)
{
	int ret;
	uint64_t end_time;
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	while (globals->tg_iterations_completed != globals->tg_current_iteration) {
		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		assert(ret == 0);
	}
	end_time = current_timestamp_ns();
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
	unmap_fault_buffers(globals);
	return end_time - start_time;
}
static void
setup_memory(test_globals_t *globals, test_variant_t variant)
{
	size_t stride = fault_buffer_stride(globals);
	for (size_t i = 0; i < globals->tg_fault_buffer_arr_length; i += stride) {
		fault_buffer_t *object = &globals->tg_fault_buffer_arr[i];
		object->fb_start = mmap_buffer(kVmObjectSize);
		object->fb_size = kVmObjectSize;
		if (variant == VARIANT_SHARE_VM_OBJECTS) {
			/*
			 * Insert another buffer into the work queue for each thread.
			 * Each buffer starts 1 page past where the previous buffer started into the vm object.
			 * Since each thread strides by the number of threads * the page size, they won't fault in the same pages.
			 */
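			/*
			 * Illustrative example (assuming 4 threads and 16 KB pages): the
			 * buffer at page offset j is faulted with a stride of 4 pages, so
			 * it touches pages j, j + 4, j + 8, ... of the object; no two of
			 * the 4 buffers carved from one vm object touch the same page.
			 */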
			for (size_t j = 1; j < globals->tg_num_threads; j++) {
				size_t offset = kPageSize * j;
				fault_buffer_t *offset_object = &globals->tg_fault_buffer_arr[i + j];
				offset_object->fb_start = object->fb_start + offset;
				offset_object->fb_size = object->fb_size - offset;
			}
		} else if (variant != VARIANT_SEPARATE_VM_OBJECTS) {
			fprintf(stderr, "Unknown test variant.\n");
			exit(1);
		}
	}
	atomic_store_explicit(next_fault_buffer_index_ptr(globals), 0, memory_order_release);
}
static void
unmap_fault_buffers(test_globals_t *globals)
{
	size_t stride = fault_buffer_stride(globals);
	for (size_t i = 0; i < globals->tg_fault_buffer_arr_length; i += stride) {
		fault_buffer_t *buffer = &globals->tg_fault_buffer_arr[i];
		int res = munmap(buffer->fb_start, buffer->fb_size);
		assert(res == 0);
	}
}
static test_globals_t *
allocate_test_globals(void)
{
	test_globals_t *globals = NULL;
	int ret;
	if (kCacheLineSize == 0) {
		size_t cachelinesize_size = sizeof(kCacheLineSize);
		ret = sysctlbyname("hw.cachelinesize", &kCacheLineSize, &cachelinesize_size, NULL, 0);
		assert(ret == 0);
		assert(kCacheLineSize > 0);
	}
	if (kPageSize == 0) {
		size_t pagesize_size = sizeof(kPageSize);
		ret = sysctlbyname("vm.pagesize", &kPageSize, &pagesize_size, NULL, 0);
		assert(ret == 0);
		assert(kPageSize > 0);
	}
	/* Over-allocate so the atomic work-queue index can live a full cache line past the struct. */
	size_t test_globals_size = sizeof(test_globals_t) + kCacheLineSize + sizeof(_Atomic size_t);
	globals = malloc(test_globals_size);
	assert(globals != NULL);
	memset(globals, 0, test_globals_size);
	return globals;
}
static void
init_globals(test_globals_t *globals, const test_args_t *args)
{
	pthread_mutexattr_t mutex_attrs;
	pthread_condattr_t cond_attrs;
	int ret;
	memset(globals, 0, sizeof(test_globals_t));

	ret = pthread_mutexattr_init(&mutex_attrs);
	assert(ret == 0);
	ret = pthread_mutex_init(&globals->tg_lock, &mutex_attrs);
	assert(ret == 0);
	ret = pthread_condattr_init(&cond_attrs);
	assert(ret == 0);
	ret = pthread_cond_init(&globals->tg_cv, &cond_attrs);
	assert(ret == 0);
	ret = pthread_mutexattr_destroy(&mutex_attrs);
	assert(ret == 0);
	ret = pthread_condattr_destroy(&cond_attrs);
	assert(ret == 0);

	globals->tg_num_threads = args->n_threads;
	globals->tg_variant = args->variant;
}
static void
init_fault_buffer_arr(test_globals_t *globals, const test_args_t *args, size_t memory_size)
{
	if (args->variant == VARIANT_SEPARATE_VM_OBJECTS) {
		// This variant creates separate vm objects up to memory size bytes total
		globals->tg_fault_buffer_arr_length = memory_size / kVmObjectSize;
	} else if (args->variant == VARIANT_SHARE_VM_OBJECTS) {
		// This variant creates separate vm objects up to memory size bytes total
		// and places a pointer into each vm object for each thread.
		globals->tg_fault_buffer_arr_length = memory_size / kVmObjectSize * globals->tg_num_threads;
	} else {
		fprintf(stderr, "Unsupported test variant.\n");
		exit(1);
	}
	// It doesn't make sense to have more threads than elements in the work queue.
	// NB: Since we scale memory_size by ncpus, this can only happen if the user
	// tries to run the benchmark with many more threads than cores.
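	// Illustrative sizing (assuming a macOS machine with 8 CPUs, so memory_size
	// is 8 * 128 MB): separate-objects yields an 8-entry work queue, while
	// share-objects with 4 threads yields 8 * 4 = 32 entries.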
	assert(globals->tg_fault_buffer_arr_length >= globals->tg_num_threads);
	globals->tg_fault_buffer_arr = calloc(globals->tg_fault_buffer_arr_length, sizeof(fault_buffer_t));
	assert(globals->tg_fault_buffer_arr);
}
static pthread_t *
spawn_worker_threads(test_globals_t *globals, unsigned int num_threads)
{
	int ret;
	pthread_attr_t pthread_attrs;
	globals->tg_num_threads = num_threads;
	pthread_t *threads = malloc(sizeof(pthread_t) * num_threads);
	assert(threads);
	ret = pthread_attr_init(&pthread_attrs);
	assert(ret == 0);
	// Spawn the background threads
	for (unsigned int i = 0; i < num_threads; i++) {
		ret = pthread_create(threads + i, &pthread_attrs, faulting_thread, globals);
		assert(ret == 0);
	}
	ret = pthread_attr_destroy(&pthread_attrs);
	assert(ret == 0);
	return threads;
}
static pthread_t *
setup_test(test_globals_t *globals, const test_args_t *args, size_t memory_size, bool verbose)
{
	init_globals(globals, args);
	init_fault_buffer_arr(globals, args, memory_size);
	benchmark_log(verbose, "Initialized global data structures.\n");
	pthread_t *workers = spawn_worker_threads(globals, args->n_threads);
	benchmark_log(verbose, "Spawned workers.\n");
	return workers;
}
static uint64_t
join_background_threads(test_globals_t *globals, pthread_t *threads)
{
	// Set the done flag so that the background threads exit
	int ret;
	uint64_t total_cputime_spent_faulting = 0;
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	globals->tg_done = true;
	ret = pthread_cond_broadcast(&globals->tg_cv);
	assert(ret == 0);
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);

	// Join the background threads
	for (unsigned int i = 0; i < globals->tg_num_threads; i++) {
		uint64_t cputime_spent_faulting = 0;
		ret = pthread_join(threads[i], (void **)&cputime_spent_faulting);
		assert(ret == 0);
		total_cputime_spent_faulting += cputime_spent_faulting;
	}
	free(threads);
	return total_cputime_spent_faulting;
}
static void
cleanup_test(test_globals_t *globals)
{
	int ret;
	ret = pthread_mutex_destroy(&globals->tg_lock);
	assert(ret == 0);
	ret = pthread_cond_destroy(&globals->tg_cv);
	assert(ret == 0);
	free(globals->tg_fault_buffer_arr);
	free(globals);
}
static void
output_results(const test_globals_t *globals, double walltime_elapsed_seconds, double cputime_elapsed_seconds)
{
	size_t pgsize;
	size_t sysctl_size = sizeof(pgsize);
	int ret = sysctlbyname("vm.pagesize", &pgsize, &sysctl_size, NULL, 0);
	assert(ret == 0);
	size_t num_pages = 0;
	double walltime_throughput, cputime_throughput;
	size_t stride = fault_buffer_stride(globals);
	for (size_t i = 0; i < globals->tg_fault_buffer_arr_length; i += stride) {
		num_pages += globals->tg_fault_buffer_arr[i].fb_size / pgsize;
	}
	num_pages *= globals->tg_iterations_completed;
	walltime_throughput = num_pages / walltime_elapsed_seconds;
	cputime_throughput = num_pages / cputime_elapsed_seconds;
	printf("-----Results-----\n");
	printf("Throughput (pages / wall second), Throughput (pages / CPU second)\n");
	printf("%f,%f\n", walltime_throughput, cputime_throughput);
}
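/*
 * Illustratively, the csv block above (which fault_throughput.lua converts to
 * perfdata) looks like:
 *
 *     -----Results-----
 *     Throughput (pages / wall second), Throughput (pages / CPU second)
 *     <pages per wall second>,<pages per CPU second>
 */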
static void
print_help(char **argv)
{
	fprintf(stderr, "%s: <test-variant> [-v] duration num_threads\n", argv[0]);
	fprintf(stderr, "\ntest variants:\n");
	fprintf(stderr, "    %s    Fault in different vm objects in each thread.\n", kSeparateObjectsArgument);
	fprintf(stderr, "    %s    Share vm objects across faulting threads.\n", kShareObjectsArgument);
}
static void
parse_arguments(int argc, char **argv, test_args_t *args)
{
	int current_argument = 1;
	memset(args, 0, sizeof(test_args_t));
	if (argc < 4 || argc > 6) {
		print_help(argv);
		exit(1);
	}
	if (argv[current_argument][0] == '-') {
		if (strcmp(argv[current_argument], "-v") == 0) {
			args->verbose = true;
		} else {
			fprintf(stderr, "Unknown argument %s\n", argv[current_argument]);
			print_help(argv);
			exit(1);
		}
		current_argument++;
	}
	if (strncasecmp(argv[current_argument], kSeparateObjectsArgument, strlen(kSeparateObjectsArgument)) == 0) {
		args->variant = VARIANT_SEPARATE_VM_OBJECTS;
	} else if (strncasecmp(argv[current_argument], kShareObjectsArgument, strlen(kShareObjectsArgument)) == 0) {
		args->variant = VARIANT_SHARE_VM_OBJECTS;
	} else {
		print_help(argv);
		exit(1);
	}
	current_argument++;

	long duration = strtol(argv[current_argument++], NULL, 10);
	if (duration == 0) {
		print_help(argv);
		exit(1);
	}
	long num_cores = strtol(argv[current_argument++], NULL, 10);
	if (num_cores == 0) {
		print_help(argv);
		exit(1);
	}
	assert(num_cores > 0 && num_cores <= get_ncpu());
	args->n_threads = (unsigned int) num_cores;
	args->duration_seconds = (unsigned long) duration;
}
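/*
 * (globals + 1) points just past the test_globals_t struct; adding
 * kCacheLineSize lands on the trailing _Atomic size_t slot that
 * allocate_test_globals() reserved for the work-queue index.
 */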
static inline _Atomic size_t *
next_fault_buffer_index_ptr(test_globals_t *globals)
{
	return (_Atomic size_t *) (((ptrdiff_t)(globals + 1)) + (int64_t)kCacheLineSize);
}
static size_t
fault_buffer_stride(const test_globals_t *globals)
{
	size_t stride = 0;
	if (globals->tg_variant == VARIANT_SEPARATE_VM_OBJECTS) {
		stride = 1;
	} else if (globals->tg_variant == VARIANT_SHARE_VM_OBJECTS) {
		stride = globals->tg_num_threads;
	} else {
		fprintf(stderr, "Unknown variant\n");
		exit(1);
	}
	return stride;
}