/*
 * Copyright (c) 2008-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_APACHE_LICENSE_HEADER_START@
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * @APPLE_APACHE_LICENSE_HEADER_END@
 */

#include "internal.h"

#undef dispatch_once
#undef dispatch_once_f

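// A stack-allocated node in the intrusive list of threads blocked on a
// given dispatch_once_t. dow_event is the per-waiter event that the
// initializing thread signals when the initializer completes, and
// dow_thread carries the initializing thread's port down the list so
// that waiters can apply a priority override to it while they block.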
typedef struct _dispatch_once_waiter_s {
	volatile struct _dispatch_once_waiter_s *volatile dow_next;
	dispatch_thread_event_s dow_event;
	mach_port_t dow_thread;
} *_dispatch_once_waiter_t;

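// Terminal predicate value stored once the initializer has run. The
// dispatch_once*() wrapper macro compares the predicate against this
// value (~0l) to skip the call on the fast path.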
#define DISPATCH_ONCE_DONE ((_dispatch_once_waiter_t)~0l)

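// Typical client usage, as an illustrative sketch only (the names
// `pred` and `compute_shared_state` are hypothetical, not part of
// this file):
//
//	static dispatch_once_t pred;
//	static void *shared;
//	dispatch_once(&pred, ^{
//		shared = compute_shared_state();	// runs exactly once
//	});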
#ifdef __BLOCKS__
void
dispatch_once(dispatch_once_t *val, dispatch_block_t block)
{
	dispatch_once_f(val, block, _dispatch_Block_invoke(block));
}
#endif

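// Slow path, reached only when the caller's inline fast path saw a
// predicate other than DISPATCH_ONCE_DONE. Two implementations follow:
// a gate-based one when DISPATCH_GATE_USE_FOR_DISPATCH_ONCE is set, and
// a portable one built on an atomic intrusive list of waiters.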
DISPATCH_NOINLINE
void
dispatch_once_f(dispatch_once_t *val, void *ctxt, dispatch_function_t func)
{
#if DISPATCH_GATE_USE_FOR_DISPATCH_ONCE
	dispatch_once_gate_t l = (dispatch_once_gate_t)val;

	if (_dispatch_once_gate_tryenter(l)) {
		// This thread won the race: run the initializer, then wake
		// every thread parked on the gate.
		_dispatch_client_callout(ctxt, func);
		_dispatch_once_gate_broadcast(l);
	} else {
		// Another thread is running (or has run) the initializer:
		// wait for the gate to be broadcast.
		_dispatch_once_gate_wait(l);
	}
#else
	_dispatch_once_waiter_t volatile *vval = (_dispatch_once_waiter_t*)val;
	struct _dispatch_once_waiter_s dow = { };
	_dispatch_once_waiter_t tail = &dow, next, tmp;
	dispatch_thread_event_t event;

	if (os_atomic_cmpxchg(vval, NULL, tail, acquire)) {
		// This thread won the race: it owns the once and runs the
		// initializer. Its own waiter node doubles as the list tail.
		dow.dow_thread = _dispatch_tid_self();
		_dispatch_client_callout(ctxt, func);

		// The next barrier must be long and strong.
		//
		// The scenario: SMP systems with weakly ordered memory models
		// and aggressive out-of-order instruction execution.
		//
		// The problem:
		//
		// The dispatch_once*() wrapper macro causes the callee's
		// instruction stream to look like this (pseudo-RISC):
		//
		//      load r5, pred-addr
		//      cmpi r5, -1
		//      beq  1f
		//      call dispatch_once*()
		//      1f:
		//      load r6, data-addr
		//
		// May be re-ordered like so:
		//
		//      load r6, data-addr
		//      load r5, pred-addr
		//      cmpi r5, -1
		//      beq  1f
		//      call dispatch_once*()
		//      1f:
		//
		// Normally, a barrier on the read side is used to work around
		// the weakly ordered memory model. But barriers are expensive
		// and we only need to synchronize once! After func(ctxt)
		// completes, the predicate will be marked as "done" and the
		// branch predictor will correctly skip the call to
		// dispatch_once*().
		//
		// A far faster alternative solution: Defeat the speculative
		// read-ahead of peer CPUs.
		//
		// Modern architectures will throw away speculative results
		// once a branch mis-prediction occurs. Therefore, if we can
		// ensure that the predicate is not marked as being complete
		// until long after the last store by func(ctxt), then we have
		// defeated the read-ahead of peer CPUs.
		//
		// In other words, the last "store" by func(ctxt) must complete
		// and then N cycles must elapse before ~0l is stored to *val.
		// The value of N is whatever is sufficient to defeat the
		// read-ahead mechanism of peer CPUs.
		//
		// On some CPUs, the most fully synchronizing instruction might
		// need to be issued.
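		//
		// For reference, the fast path alluded to above looks roughly
		// like the following sketch (an assumption about the public
		// header's inline wrapper, not its exact text):
		//
		//	#define dispatch_once_f(pred, ctxt, func) do { \
		//		if (DISPATCH_EXPECT(*(pred), ~0l) != ~0l) { \
		//			dispatch_once_f((pred), (ctxt), (func)); \
		//		} \
		//	} while (0)
		//
		// i.e. the only synchronization on the hot path is the plain
		// load of the predicate.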

		os_atomic_maximally_synchronizing_barrier();
		// the barrier above is assumed to contain a release barrier
		next = os_atomic_xchg(vval, DISPATCH_ONCE_DONE, relaxed);
		// Walk the list of waiters that queued up while the
		// initializer ran, signaling each in turn. The list ends at
		// this thread's own node (tail).
		while (next != tail) {
			// A waiter may have published itself via the cmpxchg
			// below before linking dow_next; spin until the link
			// becomes visible.
			_dispatch_wait_until(tmp = (_dispatch_once_waiter_t)next->dow_next);
			event = &next->dow_event;
			next = tmp;
			_dispatch_thread_event_signal(event);
		}
	} else {
		// Another thread owns the once: enqueue this thread as a
		// waiter and block until the initializer signals completion.
		_dispatch_thread_event_init(&dow.dow_event);
		next = *vval;
		for (;;) {
			if (next == DISPATCH_ONCE_DONE) {
				// The initializer finished while we were preparing
				// to wait; nothing to do.
				break;
			}
			if (os_atomic_cmpxchgvw(vval, next, tail, &next, release)) {
				// Pushed onto the head of the waiter list. Propagate
				// the initializing thread's port and boost it so it
				// cannot be starved below this thread's priority.
				dow.dow_thread = next->dow_thread;
				dow.dow_next = next;
				if (dow.dow_thread) {
					pthread_priority_t pp = _dispatch_get_priority();
					_dispatch_thread_override_start(dow.dow_thread, pp, val);
				}
				_dispatch_thread_event_wait(&dow.dow_event);
				if (dow.dow_thread) {
					_dispatch_thread_override_end(dow.dow_thread, val);
				}
				break;
			}
		}
		_dispatch_thread_event_destroy(&dow.dow_event);
	}
#endif
}