src/once.c

   1 /*
   2  * Copyright (c) 2008-2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_APACHE_LICENSE_HEADER_START@
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  * you may not use this file except in compliance with the License.
   8  * You may obtain a copy of the License at
   9  *
  10  *     http://www.apache.org/licenses/LICENSE-2.0
  11  *
  12  * Unless required by applicable law or agreed to in writing, software
  13  * distributed under the License is distributed on an "AS IS" BASIS,
  14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15  * See the License for the specific language governing permissions and
  16  * limitations under the License.
  17  *
  18  * @APPLE_APACHE_LICENSE_HEADER_END@
  19  */
  20
  21 #include "internal.h"
  22
  23 #undef dispatch_once
  24 #undef dispatch_once_f
  25
  26 #ifdef __BLOCKS__
  27 void
  28 dispatch_once(dispatch_once_t *val, void (^block)(void))
  29 {
  30         struct Block_basic *bb = (void *)block;
  31
  32         dispatch_once_f(val, block, (void *)bb->Block_invoke);
  33 }
  34 #endif
  35
  36 DISPATCH_NOINLINE
  37 void
  38 dispatch_once_f(dispatch_once_t *val, void *ctxt, void (*func)(void *))
  39 {
  40         volatile long *vval = val;
  41
  42         if (dispatch_atomic_cmpxchg(val, 0l, 1l)) {
  43                 func(ctxt);
  44
  45                 // The next barrier must be long and strong.
  46                 //
  47                 // The scenario: SMP systems with weakly ordered memory models
  48                 // and aggressive out-of-order instruction execution.
  49                 //
  50                 // The problem:
  51                 //
  52                 // The dispatch_once*() wrapper macro causes the callee's
  53                 // instruction stream to look like this (pseudo-RISC):
  54                 //
  55                 //      load r5, pred-addr
  56                 //      cmpi r5, -1
  57                 //      beq  1f
  58                 //      call dispatch_once*()
  59                 // 1f:
  60                 //      load r6, data-addr
  61                 //
  62                 // May be re-ordered like so:
  63                 //
  64                 //      load r6, data-addr
  65                 //      load r5, pred-addr
  66                 //      cmpi r5, -1
  67                 //      beq  1f
  68                 //      call dispatch_once*()
  69                 // 1f:
  70                 //
  71                 // Normally, a barrier on the read side is used to workaround
  72                 // the weakly ordered memory model. But barriers are expensive
  73                 // and we only need to synchronize once!  After func(ctxt)
  74                 // completes, the predicate will be marked as "done" and the
  75                 // branch predictor will correctly skip the call to
  76                 // dispatch_once*().
  77                 //
  78                 // A far faster alternative solution: Defeat the speculative
  79                 // read-ahead of peer CPUs.
  80                 //
  81                 // Modern architectures will throw away speculative results
  82                 // once a branch mis-prediction occurs. Therefore, if we can
  83                 // ensure that the predicate is not marked as being complete
  84                 // until long after the last store by func(ctxt), then we have
  85                 // defeated the read-ahead of peer CPUs.
  86                 //
  87                 // In other words, the last "store" by func(ctxt) must complete
  88                 // and then N cycles must elapse before ~0l is stored to *val.
  89                 // The value of N is whatever is sufficient to defeat the
  90                 // read-ahead mechanism of peer CPUs.
  91                 //
  92                 // On some CPUs, the most fully synchronizing instruction might
  93                 // need to be issued.
  94
  95                 dispatch_atomic_barrier();
  96                 *val = ~0l;
  97         } else {
  98                 do {
  99                         _dispatch_hardware_pause();
 100                 } while (*vval != ~0l);
 101
 102                 dispatch_atomic_barrier();
 103         }
 104 }