/*
 * Benchmark VM fault throughput.
 *
 * This test faults memory for a configurable amount of time across a
 * configurable number of threads. Currently it only measures zero-fill faults.
 * It supports two variants:
 * 1. Each thread gets its own vm objects to fault in.
 * 2. Threads share vm objects.
 *
 * We'll add more fault types as we identify problematic user-facing workloads
 * in macro benchmarks.
 *
 * Throughput is reported as pages / second using both wall time and CPU time.
 * CPU time is a more reliable metric for regression testing, but wall time can
 * highlight blocking in the VM.
 *
 * Running this benchmark directly is not recommended.
 * Use fault_throughput.lua, which provides a nicer interface and outputs
 * perfdata.
 */
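/*
 * Example invocation (binary name is illustrative; the supported harness is
 * fault_throughput.lua):
 *   ./fault_throughput separate-objects 10 4
 * runs the separate-objects variant for 10 seconds across 4 threads.
 */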
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <time.h>

#include <sys/mman.h>
#include <sys/types.h>
#include <sys/sysctl.h>

/*
 * TODO: Make this benchmark runnable on linux so we can do a perf comparison.
 * We're mostly using POSIX APIs, but we'll need to replace
 * the sysctls with the /proc equivalents, and replace clock_gettime_nsec_np
 * with the linux equivalent.
 */
#include <mach/mach.h>

#include <TargetConditionals.h>

#include <stdatomic.h>

#include "vm/perf_helpers.h"
#if (TARGET_OS_OSX || TARGET_OS_SIMULATOR)
/*
 * On non-embedded platforms we coalesce vm objects up to 128 MB, so
 * we make the objects 128 MB on those platforms to ensure they're not
 * merged with anything else.
 */
const static size_t kVmObjectSize = 128 * (1UL << 20);
#else
/*
 * Embedded platforms don't coalesce vm objects. This number
 * needs to be big enough that faulting it in dwarfs the cost of dequeuing
 * it from the work queue, but can't be too large or else we won't be able
 * to allocate one per thread in the separate-objects benchmark.
 */
const static size_t kVmObjectSize = 4 * (1UL << 20);
#endif /* (TARGET_OS_OSX || TARGET_OS_SIMULATOR) */
static const clockid_t kWallTimeClock = CLOCK_MONOTONIC_RAW;
static const clockid_t kThreadCPUTimeClock = CLOCK_THREAD_CPUTIME_ID;
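/*
 * CLOCK_MONOTONIC_RAW is not slewed by NTP adjustments, and
 * CLOCK_THREAD_CPUTIME_ID charges time to the calling thread only, so
 * per-thread CPU time can be summed across the workers.
 */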
/* These globals are set dynamically during test setup based on sysctls. */
static uint64_t kCacheLineSize = 0;
/* The VM page size */
static size_t kPageSize = 0;
typedef struct fault_buffer {
	unsigned char* fb_start; /* The start of this buffer. */
	size_t fb_size; /* The size of this buffer in bytes. */
} fault_buffer_t;
typedef enum test_variant {
	VARIANT_SEPARATE_VM_OBJECTS,
	VARIANT_SHARE_VM_OBJECTS
} test_variant_t;
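/* The variant is selected by the <test-variant> command-line argument; see kSeparateObjectsArgument and kShareObjectsArgument below. */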
typedef struct test_globals {
	/* This lock protects: tg_cv, tg_running_count, tg_done, tg_current_iteration, and tg_iterations_completed. */
	pthread_mutex_t tg_lock;
	pthread_cond_t tg_cv;
	/* The number of currently running threads */
	unsigned int tg_running_count;
	/* Set during cleanup to indicate that the benchmark is over. */
	bool tg_done;
	size_t tg_current_iteration;
	size_t tg_iterations_completed;
	unsigned int tg_num_threads;
	test_variant_t tg_variant;
	/*
	 * An array of memory objects to fault in.
	 * This is basically a workqueue of contiguous chunks of memory
	 * that the worker threads will fault in.
	 */
	fault_buffer_t *tg_fault_buffer_arr;
	size_t tg_fault_buffer_arr_length;
	/*
	 * To avoid false sharing, we pad the test globals with an extra cache line and place the atomic
	 * next_fault_buffer_index size_t after the cache line.
	 */
	__unused char padding[];
	/*
	 * This field is directly after the padding buffer.
	 * It is used to synchronize access to tg_fault_buffer_arr.
	 */
	//_Atomic size_t tg_next_fault_buffer_index;
} test_globals_t;
static const char* kSeparateObjectsArgument = "separate-objects";
static const char* kShareObjectsArgument = "share-objects";
/* Arguments parsed from the command line */
typedef struct test_args {
	unsigned int n_threads;
	uint64_t duration_seconds;
	test_variant_t variant;
	bool verbose;
} test_args_t;
/* Get a (wall-time) timestamp in nanoseconds */
static uint64_t get_timestamp_ns(void);
/* Get the number of cpus on this device. */
static unsigned int get_ncpu(void);
/*
 * Fault in the pages in the given buffer.
 */
static void fault_pages(fault_buffer_t *buffer, size_t stride);
/* Get a unique fault buffer from the global work queue. */
static fault_buffer_t *get_fault_buffer(test_globals_t* globals);
/*
 * Grabs buffers from the global test structure and faults them in, using this
 * test variant's stride, until there are no more buffers to grab.
 * Returns the number of microseconds spent on-cpu.
 */
static uint64_t grab_and_fault_pages(test_globals_t* globals);
static bool worker_thread_iteration_setup(size_t current_iteration, test_globals_t *globals);
static void worker_thread_iteration_complete(test_globals_t *globals);
static void parse_arguments(int argc, char **argv, test_args_t *args);
/*
 * Sets up the test globals and spawns the background threads to do the faults.
 * Returns an array of size `args->n_threads` containing the ids of the
 * spawned threads.
 */
static pthread_t *setup_test(test_globals_t *globals, const test_args_t *args, size_t memory_size, bool verbose);
static test_globals_t *allocate_test_globals(void);
/* Initializes variables in the globals structure. */
static void init_globals(test_globals_t *globals, const test_args_t *args);
static inline _Atomic size_t *next_fault_buffer_index_ptr(test_globals_t *globals);
/*
 * Called on the main thread.
 * Waits for the background threads to be ready, sets up the memory objects,
 * and then starts a faulting iteration.
 * Returns the start (wall) time.
 */
static uint64_t start_iteration(test_globals_t* globals, test_variant_t variant, bool verbose);
/*
 * Called on the main thread.
 * Waits for the background threads to complete the iteration and cleans up.
 * Returns the wall time elapsed during the iteration in nanoseconds.
 */
static uint64_t finish_iteration(test_globals_t *globals, uint64_t start_time);
/*
 * Called on the main thread.
 * Maps buffers and places them in the work queue.
 */
static void setup_memory(test_globals_t* globals, test_variant_t variant);
/*
 * Dump test results as a csv to stdout.
 * Use fault_throughput.lua to convert to perfdata.
 */
static void output_results(const test_globals_t *globals, double walltime_elapsed_seconds, double cputime_elapsed_seconds);
static void cleanup_test(test_globals_t *globals);
/*
 * Join the background threads and return the total microseconds
 * of cpu time spent faulting across all of the threads.
 * Takes ownership of the threads array and frees it.
 */
static uint64_t join_background_threads(test_globals_t *globals, pthread_t *threads);
static void unmap_fault_buffers(test_globals_t *globals);
/*
 * Get the stride between each vm object in the fault buffer array.
 */
static size_t fault_buffer_stride(const test_globals_t *globals);
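/*
 * Entry point. Parses arguments, spawns the worker threads, runs faulting
 * iterations until the wall-time budget is exhausted, then joins the workers
 * and reports throughput.
 */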
int
main(int argc, char **argv)
{
	/* How much memory should the test consume (per-core on the system)? */
#if (TARGET_OS_OSX || TARGET_OS_SIMULATOR)
	static const size_t memory_per_core = kVmObjectSize;
#else
	static const size_t memory_per_core = 25 * (1UL << 20);
#endif /* (TARGET_OS_OSX || TARGET_OS_SIMULATOR) */
	const size_t kMemSize = memory_per_core * get_ncpu();
	test_globals_t *globals = allocate_test_globals();
	/* Total wall-time spent faulting in pages. */
	uint64_t wall_time_elapsed_ns = 0;
	/* Total cpu-time spent faulting in pages */
	uint64_t cpu_time_faulting_us = 0;
	uint64_t start_time_ns;
	test_args_t args;
	parse_arguments(argc, argv, &args);
	pthread_t* threads = setup_test(globals, &args, kMemSize, args.verbose);
	/* Keep doing more iterations until we've hit our (wall) time budget */
	while (wall_time_elapsed_ns < args.duration_seconds * kNumNanosecondsInSecond) {
		benchmark_log(args.verbose, "----Starting Iteration %lu-----\n", globals->tg_current_iteration + 1);
		start_time_ns = start_iteration(globals, args.variant, args.verbose);
		wall_time_elapsed_ns += finish_iteration(globals, start_time_ns);
		benchmark_log(args.verbose, "----Completed Iteration %lu----\n", globals->tg_current_iteration);
	}

	benchmark_log(args.verbose, "Hit time budget\nJoining worker threads\n");
	cpu_time_faulting_us = join_background_threads(globals, threads);
	benchmark_log(args.verbose, "----End Test Output----\n");
	output_results(globals, (double) wall_time_elapsed_ns / kNumNanosecondsInSecond,
	    (double) cpu_time_faulting_us / kNumMicrosecondsInSecond);
	cleanup_test(globals);

	return 0;
}
/* The main loop for the worker threads. */
static void *
faulting_thread(void* arg)
{
	test_globals_t* globals = arg;
	uint64_t on_cpu_time_faulting = 0;
	size_t current_iteration = 1;
	while (true) {
		bool should_continue = worker_thread_iteration_setup(current_iteration, globals);
		if (!should_continue) {
			break;
		}
		on_cpu_time_faulting += grab_and_fault_pages(globals);
		worker_thread_iteration_complete(globals);
		current_iteration++;
	}
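	/*
	 * The accumulated on-cpu time is returned through the thread's exit
	 * value; join_background_threads() casts it back to a uint64_t.
	 */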
	return (void*)on_cpu_time_faulting;
}
/*
 * Called on the worker threads before each iteration to synchronize this
 * iteration's start with the other threads.
 * Returns true if the iteration should continue, and false if the test is over.
 */
static bool
worker_thread_iteration_setup(size_t current_iteration, test_globals_t *globals)
{
	bool should_continue = false;
	int ret;
	// Gate on the other threads being ready to start
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	globals->tg_running_count++;
	if (globals->tg_running_count == globals->tg_num_threads) {
		// All the worker threads are running.
		// Wake up the main thread so that it can ungate the test.
		ret = pthread_cond_broadcast(&globals->tg_cv);
		assert(ret == 0);
	}
	/*
	 * The main thread will start this iteration by incrementing
	 * tg_current_iteration. Block until that happens.
	 * See start_iteration for the wakeup code.
	 */
	while (!globals->tg_done && globals->tg_current_iteration != current_iteration) {
		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		assert(ret == 0);
	}
	should_continue = !globals->tg_done;
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
	return should_continue;
}
/*
 * Called on the worker threads as each iteration finishes to synchronize
 * with the other threads.
 */
static void
worker_thread_iteration_complete(test_globals_t *globals)
{
	int ret;
	// Mark ourselves as done and wait for the other threads to finish
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	globals->tg_running_count--;
	if (globals->tg_running_count == 0) {
		// We're the last one to finish. Mark this iteration as completed and wake everyone up.
		globals->tg_iterations_completed++;
		ret = pthread_cond_broadcast(&globals->tg_cv);
		assert(ret == 0);
	} else {
		// Others are running. Wait for them to finish.
		while (globals->tg_iterations_completed != globals->tg_current_iteration) {
			ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
			assert(ret == 0);
		}
	}
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
}
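/*
 * val is volatile, so the compiler cannot elide the assignments below; that
 * forces each load of *ptr to happen and take the fault.
 */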
static void
fault_pages(fault_buffer_t *buffer, size_t stride)
{
	volatile unsigned char val;
	for (unsigned char* ptr = buffer->fb_start; ptr < buffer->fb_start + buffer->fb_size; ptr += stride) {
		val = *ptr;
	}
}
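/*
 * The work queue is just an atomic index into tg_fault_buffer_arr;
 * fetch_add hands each caller a unique slot without taking a lock.
 */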
static fault_buffer_t *
get_fault_buffer(test_globals_t* globals)
{
	size_t index = atomic_fetch_add_explicit(next_fault_buffer_index_ptr(globals), 1UL, memory_order_acq_rel);
	if (index < globals->tg_fault_buffer_arr_length) {
		return &globals->tg_fault_buffer_arr[index];
	}
	return NULL;
}
static uint64_t
grab_and_fault_pages(test_globals_t* globals)
{
	struct timespec start_time, end_time;
	uint64_t microseconds_faulting_on_cpu = 0;
	int ret;
	size_t stride = fault_buffer_stride(globals) * kPageSize;
	while (true) {
		fault_buffer_t *object = get_fault_buffer(globals);
		if (object == NULL) {
			break;
		}
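		/*
		 * Only the faulting loop is timed, so the cost of dequeuing
		 * buffers is excluded from the CPU-time metric.
		 */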
		ret = clock_gettime(kThreadCPUTimeClock, &start_time);
		assert(ret == 0);

		fault_pages(object, stride);

		ret = clock_gettime(kThreadCPUTimeClock, &end_time);
		assert(ret == 0);
		microseconds_faulting_on_cpu += (unsigned long) timespec_difference_us(&end_time, &start_time);
	}
	return microseconds_faulting_on_cpu;
}
static uint64_t
start_iteration(test_globals_t* globals, test_variant_t variant, bool verbose)
{
	uint64_t start_time;
	int ret;
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	benchmark_log(verbose, "Waiting for workers to catch up before starting next iteration.\n");
	/* Wait until all the threads are ready to go to the next iteration */
	while (globals->tg_running_count != globals->tg_num_threads) {
		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		assert(ret == 0);
	}
	benchmark_log(verbose, "Workers are all caught up\n");
	setup_memory(globals, variant);
	benchmark_log(verbose, "Initialized data structures for iteration. Waking workers.\n");
	/* Grab a timestamp, tick the current iteration, and wake up the worker threads */
	start_time = get_timestamp_ns();
	globals->tg_current_iteration++;
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
	ret = pthread_cond_broadcast(&globals->tg_cv);
	assert(ret == 0);
	return start_time;
}
static uint64_t
finish_iteration(test_globals_t* globals, uint64_t start_time)
{
	uint64_t end_time;
	int ret;
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	while (globals->tg_iterations_completed != globals->tg_current_iteration) {
		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		assert(ret == 0);
	}
	end_time = get_timestamp_ns();
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
	unmap_fault_buffers(globals);

	return end_time - start_time;
}
static void
setup_memory(test_globals_t* globals, test_variant_t variant)
{
	size_t stride = fault_buffer_stride(globals);
	for (size_t i = 0; i < globals->tg_fault_buffer_arr_length; i += stride) {
		fault_buffer_t *object = &globals->tg_fault_buffer_arr[i];
		object->fb_start = mmap_buffer(kVmObjectSize);
		object->fb_size = kVmObjectSize;
		if (variant == VARIANT_SHARE_VM_OBJECTS) {
			/*
			 * Insert another buffer into the work queue for each thread.
			 * Each buffer starts 1 page past where the previous buffer started into the vm object.
			 * Since each thread strides by the number of threads * the page size, they won't fault in the same pages.
			 */
			for (size_t j = 1; j < globals->tg_num_threads; j++) {
				size_t offset = kPageSize * j;
				fault_buffer_t *offset_object = &globals->tg_fault_buffer_arr[i + j];
				offset_object->fb_start = object->fb_start + offset;
				offset_object->fb_size = object->fb_size - offset;
			}
		} else if (variant != VARIANT_SEPARATE_VM_OBJECTS) {
			fprintf(stderr, "Unknown test variant.\n");
			exit(2);
		}
	}
	atomic_store_explicit(next_fault_buffer_index_ptr(globals), 0, memory_order_release);
}
static void
unmap_fault_buffers(test_globals_t* globals)
{
	size_t stride = fault_buffer_stride(globals);
	for (size_t i = 0; i < globals->tg_fault_buffer_arr_length; i += stride) {
		fault_buffer_t *buffer = &globals->tg_fault_buffer_arr[i];
		int res = munmap(buffer->fb_start, buffer->fb_size);
		assert(res == 0);
	}
}
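/*
 * The globals are followed by a cache line of padding and then the atomic
 * work-queue index, so the allocation is sized for all three; see
 * next_fault_buffer_index_ptr().
 */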
static test_globals_t *
allocate_test_globals(void)
{
	test_globals_t *globals = NULL;
	int ret;
	if (kCacheLineSize == 0) {
		size_t cachelinesize_size = sizeof(kCacheLineSize);
		ret = sysctlbyname("hw.cachelinesize", &kCacheLineSize, &cachelinesize_size, NULL, 0);
		assert(ret == 0);
		assert(kCacheLineSize > 0);
	}
	if (kPageSize == 0) {
		size_t pagesize_size = sizeof(kPageSize);
		ret = sysctlbyname("vm.pagesize", &kPageSize, &pagesize_size, NULL, 0);
		assert(ret == 0);
		assert(kPageSize > 0);
	}
	size_t test_globals_size = sizeof(test_globals_t) + kCacheLineSize + sizeof(_Atomic size_t);
	globals = malloc(test_globals_size);
	assert(globals != NULL);
	memset(globals, 0, test_globals_size);
	return globals;
}
static void
init_globals(test_globals_t *globals, const test_args_t *args)
{
	pthread_mutexattr_t mutex_attrs;
	pthread_condattr_t cond_attrs;
	int ret;
	memset(globals, 0, sizeof(test_globals_t));

	ret = pthread_mutexattr_init(&mutex_attrs);
	assert(ret == 0);
	ret = pthread_mutex_init(&globals->tg_lock, &mutex_attrs);
	assert(ret == 0);
	ret = pthread_condattr_init(&cond_attrs);
	assert(ret == 0);
	ret = pthread_cond_init(&globals->tg_cv, &cond_attrs);
	assert(ret == 0);
	ret = pthread_mutexattr_destroy(&mutex_attrs);
	assert(ret == 0);
	ret = pthread_condattr_destroy(&cond_attrs);
	assert(ret == 0);

	globals->tg_num_threads = args->n_threads;
	globals->tg_variant = args->variant;
}
static void
init_fault_buffer_arr(test_globals_t *globals, const test_args_t *args, size_t memory_size)
{
	if (args->variant == VARIANT_SEPARATE_VM_OBJECTS) {
		// This variant creates separate vm objects up to memory_size bytes total.
		globals->tg_fault_buffer_arr_length = memory_size / kVmObjectSize;
	} else if (args->variant == VARIANT_SHARE_VM_OBJECTS) {
		// This variant also creates separate vm objects up to memory_size bytes total,
		// and places a pointer into each vm object for each thread.
		globals->tg_fault_buffer_arr_length = memory_size / kVmObjectSize * globals->tg_num_threads;
	} else {
		fprintf(stderr, "Unsupported test variant.\n");
		exit(2);
	}
	// It doesn't make sense to have more threads than elements in the work queue.
	// NB: Since we scale memory_size by ncpus, this can only happen if the user
	// tries to run the benchmark with many more threads than cores.
	assert(globals->tg_fault_buffer_arr_length >= globals->tg_num_threads);
	globals->tg_fault_buffer_arr = calloc(globals->tg_fault_buffer_arr_length, sizeof(fault_buffer_t));
	assert(globals->tg_fault_buffer_arr);
}
static pthread_t *
spawn_worker_threads(test_globals_t *globals, unsigned int num_threads)
{
	int ret;
	pthread_attr_t pthread_attrs;
	globals->tg_num_threads = num_threads;
	pthread_t* threads = malloc(sizeof(pthread_t) * num_threads);
	assert(threads);
	ret = pthread_attr_init(&pthread_attrs);
	assert(ret == 0);
	// Spawn the background threads
	for (unsigned int i = 0; i < num_threads; i++) {
		ret = pthread_create(threads + i, &pthread_attrs, faulting_thread, globals);
		assert(ret == 0);
	}
	ret = pthread_attr_destroy(&pthread_attrs);
	assert(ret == 0);
	return threads;
}
static pthread_t *
setup_test(test_globals_t *globals, const test_args_t *args, size_t memory_size, bool verbose)
{
	init_globals(globals, args);
	init_fault_buffer_arr(globals, args, memory_size);
	benchmark_log(verbose, "Initialized global data structures.\n");
	pthread_t *workers = spawn_worker_threads(globals, args->n_threads);
	benchmark_log(verbose, "Spawned workers.\n");
	return workers;
}
static uint64_t
join_background_threads(test_globals_t *globals, pthread_t *threads)
{
	// Set the done flag so that the background threads exit
	int ret;
	uint64_t total_cputime_spent_faulting = 0;
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	globals->tg_done = true;
	ret = pthread_cond_broadcast(&globals->tg_cv);
	assert(ret == 0);
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);

	// Join the background threads
	for (unsigned int i = 0; i < globals->tg_num_threads; i++) {
		uint64_t cputime_spent_faulting = 0;
		ret = pthread_join(threads[i], (void **)&cputime_spent_faulting);
		assert(ret == 0);
		total_cputime_spent_faulting += cputime_spent_faulting;
	}
	free(threads);
	return total_cputime_spent_faulting;
}
static void
cleanup_test(test_globals_t* globals)
{
	int ret;
	ret = pthread_mutex_destroy(&globals->tg_lock);
	assert(ret == 0);
	ret = pthread_cond_destroy(&globals->tg_cv);
	assert(ret == 0);
	free(globals->tg_fault_buffer_arr);
	free(globals);
}
static void
output_results(const test_globals_t* globals, double walltime_elapsed_seconds, double cputime_elapsed_seconds)
{
	size_t pgsize;
	size_t sysctl_size = sizeof(pgsize);
	int ret = sysctlbyname("vm.pagesize", &pgsize, &sysctl_size, NULL, 0);
	assert(ret == 0);
	size_t num_pages = 0;
	double walltime_throughput, cputime_throughput;
	size_t stride = fault_buffer_stride(globals);
	for (size_t i = 0; i < globals->tg_fault_buffer_arr_length; i += stride) {
		num_pages += globals->tg_fault_buffer_arr[i].fb_size / pgsize;
	}
	num_pages *= globals->tg_iterations_completed;
	walltime_throughput = num_pages / walltime_elapsed_seconds;
	cputime_throughput = num_pages / cputime_elapsed_seconds;
	printf("-----Results-----\n");
	printf("Throughput (pages / wall second), Throughput (pages / CPU second)\n");
	printf("%f,%f\n", walltime_throughput, cputime_throughput);
}
static void
print_help(char** argv)
{
	fprintf(stderr, "%s: <test-variant> [-v] duration num_threads\n", argv[0]);
	fprintf(stderr, "\ntest variants:\n");
	fprintf(stderr, "  %s  Fault in different vm objects in each thread.\n", kSeparateObjectsArgument);
	fprintf(stderr, "  %s  Share vm objects across faulting threads.\n", kShareObjectsArgument);
}
static uint64_t
get_timestamp_ns(void)
{
	return clock_gettime_nsec_np(kWallTimeClock);
}
static unsigned int
get_ncpu(void)
{
	int ncpu;
	size_t sysctl_size = sizeof(ncpu);
	int ret = sysctlbyname("hw.ncpu", &ncpu, &sysctl_size, NULL, 0);
	assert(ret == 0);
	return (unsigned int) ncpu;
}
static void
parse_arguments(int argc, char** argv, test_args_t *args)
{
	int current_argument = 1;
	memset(args, 0, sizeof(test_args_t));
	if (argc < 4 || argc > 6) {
		print_help(argv);
		exit(1);
	}
	if (argv[current_argument][0] == '-') {
		if (strcmp(argv[current_argument], "-v") == 0) {
			args->verbose = true;
		} else {
			fprintf(stderr, "Unknown argument %s\n", argv[current_argument]);
			print_help(argv);
			exit(1);
		}
		current_argument++;
	}
	if (strncasecmp(argv[current_argument], kSeparateObjectsArgument, strlen(kSeparateObjectsArgument)) == 0) {
		args->variant = VARIANT_SEPARATE_VM_OBJECTS;
	} else if (strncasecmp(argv[current_argument], kShareObjectsArgument, strlen(kShareObjectsArgument)) == 0) {
		args->variant = VARIANT_SHARE_VM_OBJECTS;
	} else {
		print_help(argv);
		exit(1);
	}
	current_argument++;

	long duration = strtol(argv[current_argument++], NULL, 10);
	if (duration == 0) {
		print_help(argv);
		exit(1);
	}
	long num_cores = strtol(argv[current_argument++], NULL, 10);
	if (num_cores == 0) {
		print_help(argv);
		exit(1);
	}
	assert(num_cores > 0 && num_cores <= get_ncpu());
	args->n_threads = (unsigned int) num_cores;
	args->duration_seconds = (unsigned long) duration;
}
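/*
 * globals + 1 points just past the test_globals_t struct; skipping a further
 * kCacheLineSize bytes lands on the atomic index allocated (and zeroed) in
 * allocate_test_globals().
 */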
static inline _Atomic size_t *
next_fault_buffer_index_ptr(test_globals_t *globals)
{
	return (_Atomic size_t *) (((ptrdiff_t)(globals + 1)) + (int64_t)kCacheLineSize);
}
static size_t
fault_buffer_stride(const test_globals_t *globals)
{
	size_t stride = 0;
	if (globals->tg_variant == VARIANT_SEPARATE_VM_OBJECTS) {
		stride = 1;
	} else if (globals->tg_variant == VARIANT_SHARE_VM_OBJECTS) {
		stride = globals->tg_num_threads;
	} else {
		fprintf(stderr, "Unknown variant\n");
		exit(2);
	}
	return stride;
}