/*
 * Benchmark VM fault throughput.
 * This test faults memory for a configurable amount of time across a
 * configurable number of threads. It currently measures only zero-fill
 * faults, in two variants:
 * 1. Each thread gets its own vm objects to fault in
 * 2. Threads share vm objects
 *
 * We'll add more fault types as we identify problematic user-facing workloads
 * in macro benchmarks.
 *
 * Throughput is reported as pages / second using both wall time and cpu time.
 * CPU time is a more reliable metric for regression testing, but wall time can
 * highlight blocking in the VM.
 *
 * Running this benchmark directly is not recommended.
 * Use fault_throughput.lua, which provides a nicer interface and outputs
 * perfdata.
 */
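/*
 * For reference, a direct invocation looks like (argument order per
 * parse_arguments below; binary name, duration, and thread count are
 * illustrative):
 *   ./fault_throughput -v separate-objects 10 4
 * i.e. fault verbosely for 10 seconds across 4 threads, each thread with
 * its own vm objects.
 */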
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <time.h>

#include <sys/mman.h>
#include <sys/types.h>
#include <sys/sysctl.h>
/*
 * TODO: Make this benchmark runnable on linux so we can do a perf comparison.
 * We're mostly using POSIX APIs, but we'll need to replace
 * the sysctls with the /proc equivalents, and replace clock_gettime_nsec_np
 * with the linux equivalent.
 */
#include <mach/mach.h>

#include <TargetConditionals.h>

#include <pthread.h>
#include <stdatomic.h>

#include "benchmark/helpers.h"

#if (TARGET_OS_OSX || TARGET_OS_SIMULATOR)
/*
 * On non-embedded platforms we coalesce vm objects up to 128 MB, so
 * we make the objects 128 MB on that platform to ensure they're not
 * merged with anything else.
 */
static const size_t kVmObjectSize = 128 * (1UL << 20);
#else
/*
 * Embedded platforms don't coalesce vm objects. This number
 * needs to be big enough that faulting it in dwarfs the cost of dequeuing
 * it from the work queue, but can't be too large or else we won't be able
 * to allocate one per thread in the separate-objects benchmark.
 */
static const size_t kVmObjectSize = 4 * (1UL << 20);
#endif /* (TARGET_OS_OSX || TARGET_OS_SIMULATOR) */
static const clockid_t kWallTimeClock = CLOCK_MONOTONIC_RAW;
static const clockid_t kThreadCPUTimeClock = CLOCK_THREAD_CPUTIME_ID;
/* These globals are set dynamically during test setup based on sysctls. */
static uint64_t kCacheLineSize = 0;
/* The VM page size */
static size_t kPageSize = 0;


typedef struct fault_buffer {
	unsigned char* fb_start; /* The start of this buffer. */
	size_t fb_size; /* The size of this buffer in bytes. */
} fault_buffer_t;

typedef enum test_variant {
	VARIANT_SEPARATE_VM_OBJECTS,
	VARIANT_SHARE_VM_OBJECTS
} test_variant_t;

typedef struct test_globals {
	/* This lock protects: tg_cv, tg_running_count, tg_done, tg_current_iteration, and tg_iterations_completed. */
	pthread_mutex_t tg_lock;
	pthread_cond_t tg_cv;
	/* The number of currently running threads */
	unsigned int tg_running_count;
	/* Set during cleanup to indicate that the benchmark is over. */
	bool tg_done;
	size_t tg_current_iteration;
	size_t tg_iterations_completed;
	unsigned int tg_num_threads;
	test_variant_t tg_variant;
	/*
	 * An array of memory objects to fault in.
	 * This is basically a workqueue of contiguous chunks of memory
	 * that the worker threads will fault in.
	 */
	fault_buffer_t *tg_fault_buffer_arr;
	size_t tg_fault_buffer_arr_length;
	/*
	 * To avoid false sharing, we pad the test globals with an extra cache
	 * line and place the atomic next_fault_buffer_index size_t after the
	 * cache line.
	 */
	__unused char padding[];
	/*
	 * This field lives directly after the padding buffer. Because
	 * kCacheLineSize is only known at runtime, it can't be declared as a
	 * named member; it is accessed via next_fault_buffer_index_ptr()
	 * instead. It synchronizes access to tg_fault_buffer_arr.
	 */
	//_Atomic size_t tg_next_fault_buffer_index;
} test_globals_t;
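/*
 * Resulting allocation layout (see allocate_test_globals and
 * next_fault_buffer_index_ptr):
 *
 *   [ test_globals_t | kCacheLineSize bytes of padding | _Atomic size_t ]
 *
 * Keeping the work-queue cursor a full cache line away from the other
 * globals ensures the workers hammering it with atomic increments don't
 * false-share with reads of the rest of the struct.
 */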

static const char* kSeparateObjectsArgument = "separate-objects";
static const char* kShareObjectsArgument = "share-objects";

/* Arguments parsed from the command line */
typedef struct test_args {
	uint32_t n_threads;
	uint64_t duration_seconds;
	test_variant_t variant;
	bool verbose;
} test_args_t;

/*
 * Fault in the pages in the given buffer.
 */
static void fault_pages(fault_buffer_t *buffer, size_t stride);
/* Get a unique fault buffer from the global work queue. */
static fault_buffer_t *get_fault_buffer(test_globals_t* globals);
/*
 * Grabs buffers from the global test structure and faults them in, using this
 * test variant's stride, until there are no more buffers to grab.
 * Returns the number of microseconds spent on-cpu.
 */
static uint64_t grab_and_fault_pages(test_globals_t* globals);

static bool worker_thread_iteration_setup(size_t current_iteration, test_globals_t *globals);
static void worker_thread_iteration_complete(test_globals_t *globals);

static void parse_arguments(int argc, char **argv, test_args_t *args);
/*
 * Sets up the test globals and spawns the background threads to do the faults.
 * Returns an array of size `num_threads` containing the thread ids of the
 * forked threads.
 */
static pthread_t* setup_test(test_globals_t *globals, const test_args_t *args, size_t memory_size, bool verbose);
static test_globals_t *allocate_test_globals(void);
/* Initializes variables in the globals struct. */
static void init_globals(test_globals_t *globals, const test_args_t *args);
static inline _Atomic size_t *next_fault_buffer_index_ptr(test_globals_t *globals);
/*
 * Called on the main thread.
 * Waits for the background threads to be ready, sets up the memory objects,
 * and then starts a faulting iteration.
 * Returns the start (wall) time.
 */
static uint64_t start_iteration(test_globals_t* globals, test_variant_t variant, bool verbose);
/*
 * Called on the main thread.
 * Waits for the background threads to complete the iteration and cleans up.
 * Returns the iteration's elapsed wall time in nanoseconds.
 */
static uint64_t finish_iteration(test_globals_t *globals, uint64_t start_time);
/*
 * Called on the main thread.
 * Maps buffers and places them in the work queue.
 */
static void setup_memory(test_globals_t* globals, test_variant_t variant);
/*
 * Dump test results as a csv to stdout.
 * Use fault_throughput.lua to convert to perfdata.
 */
static void output_results(const test_globals_t *globals, double walltime_elapsed_seconds, double cputime_elapsed_seconds);
static void cleanup_test(test_globals_t *globals);
/*
 * Join the background threads and return the total microseconds
 * of cpu time spent faulting across all of the threads.
 * Takes ownership of the threads array and frees it.
 */
static uint64_t join_background_threads(test_globals_t *globals, pthread_t *threads);
static void unmap_fault_buffers(test_globals_t *globals);
/*
 * Get the stride between each vm object in the fault buffer array.
 */
static size_t fault_buffer_stride(const test_globals_t *globals);

int
main(int argc, char **argv)
{
	/* How much memory should the test consume (per-core on the system)? */
#if (TARGET_OS_OSX || TARGET_OS_SIMULATOR)
	static const size_t memory_per_core = kVmObjectSize;
#else
	static const size_t memory_per_core = 25 * (1UL << 20);
#endif /* (TARGET_OS_OSX || TARGET_OS_SIMULATOR) */
	const size_t kMemSize = memory_per_core * (size_t) get_ncpu();
	test_globals_t *globals = allocate_test_globals();
	/* Total wall-time spent faulting in pages. */
	uint64_t wall_time_elapsed_ns = 0;
	/* Total cpu-time spent faulting in pages */
	uint64_t cpu_time_faulting_us = 0;
	uint64_t start_time_ns;
	test_args_t args;
	parse_arguments(argc, argv, &args);
	pthread_t* threads = setup_test(globals, &args, kMemSize, args.verbose);

	/* Keep doing more iterations until we've hit our (wall) time budget */
	while (wall_time_elapsed_ns < args.duration_seconds * kNumNanosecondsInSecond) {
		benchmark_log(args.verbose, "----Starting Iteration %lu----\n", globals->tg_current_iteration + 1);
		start_time_ns = start_iteration(globals, args.variant, args.verbose);
		wall_time_elapsed_ns += finish_iteration(globals, start_time_ns);
		benchmark_log(args.verbose, "----Completed Iteration %lu----\n", globals->tg_current_iteration);
	}

	benchmark_log(args.verbose, "Hit time budget\nJoining worker threads\n");
	cpu_time_faulting_us = join_background_threads(globals, threads);
	benchmark_log(args.verbose, "----End Test Output----\n");
	output_results(globals, (double) wall_time_elapsed_ns / kNumNanosecondsInSecond,
	    (double) cpu_time_faulting_us / kNumMicrosecondsInSecond);
	cleanup_test(globals);

	return 0;
}


/* The main loop for the worker threads. */
static void*
faulting_thread(void* arg)
{
	test_globals_t* globals = arg;
	uint64_t on_cpu_time_faulting = 0;
	size_t current_iteration = 1;
	while (true) {
		bool should_continue = worker_thread_iteration_setup(current_iteration, globals);
		if (!should_continue) {
			break;
		}
		on_cpu_time_faulting += grab_and_fault_pages(globals);
		worker_thread_iteration_complete(globals);
		current_iteration++;
	}
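	/*
	 * NB: the accumulated on-cpu time (microseconds) is smuggled back to
	 * pthread_join in join_background_threads through the void * return
	 * value. This assumes pointers are at least 64 bits wide, which holds
	 * on the LP64 platforms this benchmark targets.
	 */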
	return (void*)on_cpu_time_faulting;
}

/*
 * Called on the worker threads before each iteration to synchronize this
 * iteration's start with the other threads.
 * Returns true if the iteration should continue, and false if the test is over.
 */
static bool
worker_thread_iteration_setup(size_t current_iteration, test_globals_t *globals)
{
	bool should_continue = false;
	int ret = 0;
	// Gate on the other threads being ready to start
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	globals->tg_running_count++;
	if (globals->tg_running_count == globals->tg_num_threads) {
		// All the worker threads are running.
		// Wake up the main thread so that it can ungate the test.
		ret = pthread_cond_broadcast(&globals->tg_cv);
		assert(ret == 0);
	}
	/*
	 * The main thread will start this iteration by incrementing
	 * tg_current_iteration. Block until that happens.
	 * See start_iteration for the wakeup code.
	 */
	while (!globals->tg_done && globals->tg_current_iteration != current_iteration) {
		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		assert(ret == 0);
	}
	should_continue = !globals->tg_done;
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
	return should_continue;
}

/*
 * Called on the worker threads before each iteration finishes to synchronize
 * with the other threads.
 */
static void
worker_thread_iteration_complete(test_globals_t *globals)
{
	int ret;
	// Mark ourselves as done and wait for the other threads to finish
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	globals->tg_running_count--;
	if (globals->tg_running_count == 0) {
		// We're the last one to finish. Mark this iteration as completed and wake everyone up.
		globals->tg_iterations_completed++;
		ret = pthread_cond_broadcast(&globals->tg_cv);
		assert(ret == 0);
	} else {
		// Others are running. Wait for them to finish.
		while (globals->tg_iterations_completed != globals->tg_current_iteration) {
			ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
			assert(ret == 0);
		}
	}
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
}

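/*
 * A single volatile read per stride is enough to trigger the fault: the
 * buffers are fresh anonymous mappings, so the first touch of each page
 * takes the zero-fill fault this benchmark is measuring.
 */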
static void
fault_pages(fault_buffer_t *buffer, size_t stride)
{
	volatile unsigned char val;
	for (unsigned char* ptr = buffer->fb_start; ptr < buffer->fb_start + buffer->fb_size; ptr += stride) {
		val = *ptr;
	}
}

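/*
 * The fault buffer array doubles as a lock-free work queue: each worker
 * claims the next buffer with an atomic fetch-add on the shared cursor,
 * and a claimed index past the end of the array means the iteration's
 * work has all been handed out.
 */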
static fault_buffer_t *
get_fault_buffer(test_globals_t* globals)
{
	size_t index = atomic_fetch_add_explicit(next_fault_buffer_index_ptr(globals), 1UL, memory_order_acq_rel);
	if (index < globals->tg_fault_buffer_arr_length) {
		return &globals->tg_fault_buffer_arr[index];
	}
	return NULL;
}

static uint64_t
grab_and_fault_pages(test_globals_t* globals)
{
	struct timespec start_time, end_time;
	uint64_t microseconds_faulting_on_cpu = 0;
	int ret;
	size_t stride = fault_buffer_stride(globals) * kPageSize;
	while (true) {
		fault_buffer_t *object = get_fault_buffer(globals);
		if (object == NULL) {
			break;
		}
		ret = clock_gettime(kThreadCPUTimeClock, &start_time);
		assert(ret == 0);

		fault_pages(object, stride);

		ret = clock_gettime(kThreadCPUTimeClock, &end_time);
		assert(ret == 0);
		microseconds_faulting_on_cpu += (unsigned long) timespec_difference_us(&end_time, &start_time);
	}
	return microseconds_faulting_on_cpu;
}

static uint64_t
start_iteration(test_globals_t* globals, test_variant_t variant, bool verbose)
{
	int ret;
	uint64_t start_time;
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	benchmark_log(verbose, "Waiting for workers to catch up before starting next iteration.\n");
	/* Wait until all the threads are ready to go to the next iteration */
	while (globals->tg_running_count != globals->tg_num_threads) {
		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		assert(ret == 0);
	}
	benchmark_log(verbose, "Workers are all caught up\n");
	setup_memory(globals, variant);
	benchmark_log(verbose, "Initialized data structures for iteration. Waking workers.\n");
	/* Grab a timestamp, tick the current iteration, and wake up the worker threads */
	start_time = current_timestamp_ns();
	globals->tg_current_iteration++;
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
	ret = pthread_cond_broadcast(&globals->tg_cv);
	assert(ret == 0);
	return start_time;
}

static uint64_t
finish_iteration(test_globals_t* globals, uint64_t start_time)
{
	int ret;
	uint64_t end_time;
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	while (globals->tg_iterations_completed != globals->tg_current_iteration) {
		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		assert(ret == 0);
	}
	end_time = current_timestamp_ns();
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
	unmap_fault_buffers(globals);
	return end_time - start_time;
}

static void
setup_memory(test_globals_t* globals, test_variant_t variant)
{
	size_t stride = fault_buffer_stride(globals);
	for (size_t i = 0; i < globals->tg_fault_buffer_arr_length; i += stride) {
		fault_buffer_t *object = &globals->tg_fault_buffer_arr[i];
		object->fb_start = mmap_buffer(kVmObjectSize);
		object->fb_size = kVmObjectSize;
		if (variant == VARIANT_SHARE_VM_OBJECTS) {
			/*
			 * Insert another buffer into the work queue for each thread.
			 * Each buffer starts 1 page past where the previous buffer started into the vm object.
			 * Since each thread strides by the number of threads * the page size, no two buffers
			 * fault in the same pages.
			 */
			for (size_t j = 1; j < globals->tg_num_threads; j++) {
				size_t offset = kPageSize * j;
				fault_buffer_t *offset_object = &globals->tg_fault_buffer_arr[i + j];
				offset_object->fb_start = object->fb_start + offset;
				offset_object->fb_size = object->fb_size - offset;
			}
		} else if (variant != VARIANT_SEPARATE_VM_OBJECTS) {
			fprintf(stderr, "Unknown test variant.\n");
			exit(2);
		}
	}
	atomic_store_explicit(next_fault_buffer_index_ptr(globals), 0, memory_order_release);
}
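/*
 * Worked example for the shared variant (numbers illustrative): with 4
 * threads, each vm object gets 4 work-queue entries starting at page
 * offsets 0 through 3, and fault_pages strides by 4 pages within each
 * entry. Entry j therefore touches pages j, j+4, j+8, ... of the object,
 * so the entries cover disjoint pages of one shared vm object.
 */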

static void
unmap_fault_buffers(test_globals_t* globals)
{
	size_t stride = fault_buffer_stride(globals);
	for (size_t i = 0; i < globals->tg_fault_buffer_arr_length; i += stride) {
		fault_buffer_t *buffer = &globals->tg_fault_buffer_arr[i];
		int res = munmap(buffer->fb_start, buffer->fb_size);
		assert(res == 0);
	}
}

static test_globals_t *
allocate_test_globals(void)
{
	test_globals_t *globals = NULL;
	int ret;
	if (kCacheLineSize == 0) {
		size_t cachelinesize_size = sizeof(kCacheLineSize);
		ret = sysctlbyname("hw.cachelinesize", &kCacheLineSize, &cachelinesize_size, NULL, 0);
		assert(ret == 0);
		assert(kCacheLineSize > 0);
	}
	if (kPageSize == 0) {
		size_t pagesize_size = sizeof(kPageSize);
		ret = sysctlbyname("vm.pagesize", &kPageSize, &pagesize_size, NULL, 0);
		assert(ret == 0);
		assert(kPageSize > 0);
	}
	size_t test_globals_size = sizeof(test_globals_t) + kCacheLineSize + sizeof(_Atomic size_t);
	globals = malloc(test_globals_size);
	assert(globals != NULL);
	memset(globals, 0, test_globals_size);
	return globals;
}

static void
init_globals(test_globals_t *globals, const test_args_t *args)
{
	pthread_mutexattr_t mutex_attrs;
	pthread_condattr_t cond_attrs;
	int ret;
	memset(globals, 0, sizeof(test_globals_t));

	ret = pthread_mutexattr_init(&mutex_attrs);
	assert(ret == 0);
	ret = pthread_mutex_init(&globals->tg_lock, &mutex_attrs);
	assert(ret == 0);
	ret = pthread_condattr_init(&cond_attrs);
	assert(ret == 0);
	ret = pthread_cond_init(&globals->tg_cv, &cond_attrs);
	assert(ret == 0);
	ret = pthread_mutexattr_destroy(&mutex_attrs);
	assert(ret == 0);
	ret = pthread_condattr_destroy(&cond_attrs);
	assert(ret == 0);

	globals->tg_num_threads = args->n_threads;
	globals->tg_variant = args->variant;
}

static void
init_fault_buffer_arr(test_globals_t *globals, const test_args_t *args, size_t memory_size)
{
	if (args->variant == VARIANT_SEPARATE_VM_OBJECTS) {
		// This variant creates separate vm objects up to memory_size bytes total.
		globals->tg_fault_buffer_arr_length = memory_size / kVmObjectSize;
	} else if (args->variant == VARIANT_SHARE_VM_OBJECTS) {
		// This variant also creates separate vm objects up to memory_size bytes
		// total, but places one work-queue entry into each vm object per thread.
		globals->tg_fault_buffer_arr_length = memory_size / kVmObjectSize * globals->tg_num_threads;
	} else {
		fprintf(stderr, "Unsupported test variant.\n");
		exit(2);
	}
	// It doesn't make sense to have more threads than elements in the work queue.
	// NB: Since we scale memory_size by ncpus, this can only happen if the user
	// tries to run the benchmark with many more threads than cores.
	assert(globals->tg_fault_buffer_arr_length >= globals->tg_num_threads);
	globals->tg_fault_buffer_arr = calloc(globals->tg_fault_buffer_arr_length, sizeof(fault_buffer_t));
	assert(globals->tg_fault_buffer_arr);
}

static pthread_t *
spawn_worker_threads(test_globals_t *globals, unsigned int num_threads)
{
	int ret;
	pthread_attr_t pthread_attrs;
	globals->tg_num_threads = num_threads;
	pthread_t* threads = malloc(sizeof(pthread_t) * num_threads);
	assert(threads);
	ret = pthread_attr_init(&pthread_attrs);
	assert(ret == 0);
	// Spawn the background threads
	for (unsigned int i = 0; i < num_threads; i++) {
		ret = pthread_create(threads + i, &pthread_attrs, faulting_thread, globals);
		assert(ret == 0);
	}
	ret = pthread_attr_destroy(&pthread_attrs);
	assert(ret == 0);
	return threads;
}

static pthread_t*
setup_test(test_globals_t *globals, const test_args_t *args, size_t memory_size, bool verbose)
{
	init_globals(globals, args);
	init_fault_buffer_arr(globals, args, memory_size);
	benchmark_log(verbose, "Initialized global data structures.\n");
	pthread_t *workers = spawn_worker_threads(globals, args->n_threads);
	benchmark_log(verbose, "Spawned workers.\n");
	return workers;
}

static uint64_t
join_background_threads(test_globals_t *globals, pthread_t *threads)
{
	// Set the done flag so that the background threads exit
	int ret;
	uint64_t total_cputime_spent_faulting = 0;
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	globals->tg_done = true;
	ret = pthread_cond_broadcast(&globals->tg_cv);
	assert(ret == 0);
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);

	// Join the background threads
	for (unsigned int i = 0; i < globals->tg_num_threads; i++) {
		uint64_t cputime_spent_faulting = 0;
		ret = pthread_join(threads[i], (void **)&cputime_spent_faulting);
		assert(ret == 0);
		total_cputime_spent_faulting += cputime_spent_faulting;
	}
	free(threads);
	return total_cputime_spent_faulting;
}

static void
cleanup_test(test_globals_t* globals)
{
	int ret;
	ret = pthread_mutex_destroy(&globals->tg_lock);
	assert(ret == 0);
	ret = pthread_cond_destroy(&globals->tg_cv);
	assert(ret == 0);
	free(globals->tg_fault_buffer_arr);
	free(globals);
}

static void
output_results(const test_globals_t* globals, double walltime_elapsed_seconds, double cputime_elapsed_seconds)
{
	size_t pgsize;
	size_t sysctl_size = sizeof(pgsize);
	int ret = sysctlbyname("vm.pagesize", &pgsize, &sysctl_size, NULL, 0);
	assert(ret == 0);
	size_t num_pages = 0;
	double walltime_throughput, cputime_throughput;
	size_t stride = fault_buffer_stride(globals);
	for (size_t i = 0; i < globals->tg_fault_buffer_arr_length; i += stride) {
		num_pages += globals->tg_fault_buffer_arr[i].fb_size / pgsize;
	}
	num_pages *= globals->tg_iterations_completed;
	walltime_throughput = num_pages / walltime_elapsed_seconds;
	cputime_throughput = num_pages / cputime_elapsed_seconds;
	printf("-----Results-----\n");
	printf("Throughput (pages / wall second), Throughput (pages / CPU second)\n");
	printf("%f,%f\n", walltime_throughput, cputime_throughput);
}
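/*
 * Sample of the csv emitted above (values illustrative, not measured):
 *   -----Results-----
 *   Throughput (pages / wall second), Throughput (pages / CPU second)
 *   1523847.250000,412903.125000
 */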

static void
print_help(char** argv)
{
	fprintf(stderr, "%s: [-v] <test-variant> duration num_threads\n", argv[0]);
	fprintf(stderr, "\ntest variants:\n");
	fprintf(stderr, "  %s  Fault in different vm objects in each thread.\n", kSeparateObjectsArgument);
	fprintf(stderr, "  %s  Share vm objects across faulting threads.\n", kShareObjectsArgument);
}

static void
parse_arguments(int argc, char** argv, test_args_t *args)
{
	int current_argument = 1;
	memset(args, 0, sizeof(test_args_t));
	if (argc < 4 || argc > 5) {
		print_help(argv);
		exit(1);
	}
	if (argv[current_argument][0] == '-') {
		if (strcmp(argv[current_argument], "-v") == 0) {
			args->verbose = true;
		} else {
			fprintf(stderr, "Unknown argument %s\n", argv[current_argument]);
			print_help(argv);
			exit(1);
		}
		current_argument++;
	}
	if (strncasecmp(argv[current_argument], kSeparateObjectsArgument, strlen(kSeparateObjectsArgument)) == 0) {
		args->variant = VARIANT_SEPARATE_VM_OBJECTS;
	} else if (strncasecmp(argv[current_argument], kShareObjectsArgument, strlen(kShareObjectsArgument)) == 0) {
		args->variant = VARIANT_SHARE_VM_OBJECTS;
	} else {
		print_help(argv);
		exit(1);
	}
	current_argument++;

	long duration = strtol(argv[current_argument++], NULL, 10);
	if (duration == 0) {
		print_help(argv);
		exit(1);
	}
	long num_cores = strtol(argv[current_argument++], NULL, 10);
	if (num_cores == 0) {
		print_help(argv);
		exit(1);
	}
	assert(num_cores > 0 && num_cores <= get_ncpu());
	args->n_threads = (unsigned int) num_cores;
	args->duration_seconds = (unsigned long) duration;
}

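/*
 * Compute the address of the hidden work-queue cursor: step over the
 * struct itself (globals + 1), then over the kCacheLineSize bytes of
 * padding. This is the runtime counterpart of the commented-out
 * tg_next_fault_buffer_index member in test_globals_t.
 */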
static inline _Atomic size_t *
next_fault_buffer_index_ptr(test_globals_t *globals)
{
	return (_Atomic size_t *)((char *)(globals + 1) + kCacheLineSize);
}

static size_t
fault_buffer_stride(const test_globals_t *globals)
{
	size_t stride;
	if (globals->tg_variant == VARIANT_SEPARATE_VM_OBJECTS) {
		stride = 1;
	} else if (globals->tg_variant == VARIANT_SHARE_VM_OBJECTS) {
		stride = globals->tg_num_threads;
	} else {
		fprintf(stderr, "Unknown variant\n");
		exit(-1);
	}
	return stride;
}