/*
 * Copyright (c) 2008-2011 Apple Inc. All rights reserved.
 *
 * @APPLE_APACHE_LICENSE_HEADER_START@
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * @APPLE_APACHE_LICENSE_HEADER_END@
 */

#include "internal.h"

#undef dispatch_once
#undef dispatch_once_f

struct _dispatch_once_waiter_s {
	volatile struct _dispatch_once_waiter_s *volatile dow_next;
	_dispatch_thread_semaphore_t dow_sema;
};

#define DISPATCH_ONCE_DONE ((struct _dispatch_once_waiter_s *)~0l)
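
// Threads that lose the race to run the initializer park themselves on a
// singly linked list of stack-allocated _dispatch_once_waiter_s records
// rooted at *val. DISPATCH_ONCE_DONE (~0l) is the sentinel value stored
// into *val once the initializer has completed.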

void
dispatch_once(dispatch_once_t *val, dispatch_block_t block)
{
	struct Block_basic *bb = (void *)block;

	dispatch_once_f(val, block, (void *)bb->Block_invoke);
}
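
/*
 * Illustrative caller-side sketch (not part of this file): the predicate
 * must have static or global storage so that every thread observes the
 * same dispatch_once_t.
 *
 *     static dispatch_once_t pred;
 *     dispatch_once(&pred, ^{
 *         // one-time initialization; runs at most once process-wide
 *     });
 */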

void
dispatch_once_f(dispatch_once_t *val, void *ctxt, dispatch_function_t func)
{
	struct _dispatch_once_waiter_s *volatile *vval =
			(struct _dispatch_once_waiter_s **)val;
	struct _dispatch_once_waiter_s dow = { NULL, 0 };
	struct _dispatch_once_waiter_s *tail, *tmp;
	_dispatch_thread_semaphore_t sema;

	if (dispatch_atomic_cmpxchg(vval, NULL, &dow)) {
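		// This thread won the race: run the initializer exactly once, then
		// publish DISPATCH_ONCE_DONE and wake any waiters that queued up
		// behind &dow in the meantime.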
		dispatch_atomic_acquire_barrier();
		_dispatch_client_callout(ctxt, func);

		// The next barrier must be long and strong.
		//
		// The scenario: SMP systems with weakly ordered memory models
		// and aggressive out-of-order instruction execution.
		//
		// The problem:
		//
		// The dispatch_once*() wrapper macro causes the callee's
		// instruction stream to look like this (pseudo-RISC):
		//
		//      load r5, pred-addr
		//      cmpi r5, -1
		//      beq  1f
		//      call dispatch_once*()
		//      1f:
		//      load r6, data-addr
		//
		// May be re-ordered like so:
		//
		//      load r6, data-addr
		//      load r5, pred-addr
		//      cmpi r5, -1
		//      beq  1f
		//      call dispatch_once*()
		//      1f:
		//
		// Normally, a barrier on the read side is used to workaround
		// the weakly ordered memory model. But barriers are expensive
		// and we only need to synchronize once! After func(ctxt)
		// completes, the predicate will be marked as "done" and the
		// branch predictor will correctly skip the call to
		// dispatch_once*().
		//
		// A far faster alternative solution: Defeat the speculative
		// read-ahead of peer CPUs.
		//
		// Modern architectures will throw away speculative results
		// once a branch mis-prediction occurs. Therefore, if we can
		// ensure that the predicate is not marked as being complete
		// until long after the last store by func(ctxt), then we have
		// defeated the read-ahead of peer CPUs.
		//
		// In other words, the last "store" by func(ctxt) must complete
		// and then N cycles must elapse before ~0l is stored to *val.
		// The value of N is whatever is sufficient to defeat the
		// read-ahead mechanism of peer CPUs.
		//
		// On some CPUs, the most fully synchronizing instruction might
		// need to be issued.

		dispatch_atomic_maximally_synchronizing_barrier();
		//dispatch_atomic_release_barrier(); // assumed contained in above
		tmp = dispatch_atomic_xchg(vval, DISPATCH_ONCE_DONE);
		tail = &dow;
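		// Walk the chain of waiters that enqueued themselves while
		// func(ctxt) was running; the chain ends at &dow, the record
		// planted by the winning cmpxchg above.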
		while (tail != tmp) {
			while (!tmp->dow_next) {
				_dispatch_hardware_pause();
			}
			sema = tmp->dow_sema;
			tmp = (struct _dispatch_once_waiter_s *)tmp->dow_next;
			_dispatch_thread_semaphore_signal(sema);
		}
	} else {
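		// Contended path: this thread lost the race. Queue a waiter record
		// and block on a per-thread semaphore until the winner signals it,
		// unless the predicate is already marked DISPATCH_ONCE_DONE.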
		dow.dow_sema = _dispatch_get_thread_semaphore();

		for (;;) {
			tmp = *vval;
			if (tmp == DISPATCH_ONCE_DONE) {
				break;
			}
			dispatch_atomic_store_barrier();
			if (dispatch_atomic_cmpxchg(vval, tmp, &dow)) {
				dow.dow_next = tmp;
				_dispatch_thread_semaphore_wait(dow.dow_sema);
			}
		}
		_dispatch_put_thread_semaphore(dow.dow_sema);
	}
}