/*
 * Copyright (c) 2008-2011 Apple Inc. All rights reserved.
 *
 * @APPLE_APACHE_LICENSE_HEADER_START@
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * @APPLE_APACHE_LICENSE_HEADER_END@
 */

#include "internal.h"

// We'd use __attribute__((aligned(x))), but it does not actually increase the
// alignment of stack variables. All we really need is for the stack usage of
// the local thread to be sufficiently far away to avoid cache-line contention
// with the busy 'da_index' variable.
//
// NOTE: 'char' arrays cause GCC to insert buffer overflow detection logic
struct dispatch_apply_s {
	long _da_pad0[DISPATCH_CACHELINE_SIZE / sizeof(long)];
	void (*da_func)(void *, size_t);
	void *da_ctxt;
	size_t da_iterations;
	size_t da_index;
	uint32_t da_thr_cnt;
	_dispatch_thread_semaphore_t da_sema;
	dispatch_queue_t da_queue;
	long _da_pad1[DISPATCH_CACHELINE_SIZE / sizeof(long)];
};

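// One dispatch_apply_s lives on the stack of the thread that called
// dispatch_apply_f() and is shared by every worker thread: 'da_index' is the
// shared iteration cursor that workers advance atomically, 'da_thr_cnt' counts
// the workers that have not yet finished, and the last worker to finish
// signals 'da_sema' to wake the waiting caller. 'da_queue' is only set on the
// redirect path (queues with a target queue).
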
DISPATCH_ALWAYS_INLINE
static inline void
_dispatch_apply_invoke(void *ctxt)
{
	struct dispatch_apply_s *da = ctxt;
	size_t const iter = da->da_iterations;
	typeof(da->da_func) const func = da->da_func;
	void *const da_ctxt = da->da_ctxt;
	size_t idx;

	_dispatch_workitem_dec(); // this unit executes many items

	// Make nested dispatch_apply fall into serial case rdar://problem/9294578
	_dispatch_thread_setspecific(dispatch_apply_key, (void*)~0ul);
	// Striding is the responsibility of the caller.
	while (fastpath((idx = dispatch_atomic_inc2o(da, da_index) - 1) < iter)) {
		_dispatch_client_callout2(da_ctxt, idx, func);
		_dispatch_workitem_inc();
	}
	_dispatch_thread_setspecific(dispatch_apply_key, NULL);

	dispatch_atomic_release_barrier();
	if (dispatch_atomic_dec2o(da, da_thr_cnt) == 0) {
		_dispatch_thread_semaphore_signal(da->da_sema);
	}
}

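// The two functions below are the per-worker entry points. _dispatch_apply2()
// simply runs the shared claiming loop above; _dispatch_apply3() additionally
// installs da_queue as the thread's current queue for the duration of the
// loop (used by the redirect path, so work running on the root queue still
// observes the originally targeted queue as current).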
static void
_dispatch_apply2(void *ctxt)
{
	_dispatch_apply_invoke(ctxt);
}

static void
_dispatch_apply3(void *ctxt)
{
	struct dispatch_apply_s *da = ctxt;
	dispatch_queue_t old_dq = _dispatch_thread_getspecific(dispatch_queue_key);

	_dispatch_thread_setspecific(dispatch_queue_key, da->da_queue);
	_dispatch_apply_invoke(ctxt);
	_dispatch_thread_setspecific(dispatch_queue_key, old_dq);
}

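// Serial fallback: run every iteration in order on the calling thread. Used
// when the target queue cannot usefully run iterations concurrently or when
// dispatch_apply is nested inside another dispatch_apply.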
static void
_dispatch_apply_serial(void *ctxt)
{
	struct dispatch_apply_s *da = ctxt;
	size_t idx = 0;

	_dispatch_workitem_dec(); // this unit executes many items
	do {
		_dispatch_client_callout2(da->da_ctxt, idx, da->da_func);
		_dispatch_workitem_inc();
	} while (++idx < da->da_iterations);
}

// 256 threads should be good enough for the short to mid term
#define DISPATCH_APPLY_MAX_CPUS 256

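// Parallel driver: builds (da_thr_cnt - 1) continuations in a stack array,
// chains them together and pushes them onto 'dq' in a single operation, then
// has the calling thread act as the first worker. It finally blocks on
// 'da_sema' until the last worker signals completion, which is what makes it
// safe for 'da' (and the continuations) to live on the stack.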
DISPATCH_ALWAYS_INLINE
static inline void
_dispatch_apply_f2(dispatch_queue_t dq, struct dispatch_apply_s *da,
		dispatch_function_t func)
{
	struct dispatch_apply_dc_s {
		DISPATCH_CONTINUATION_HEADER(dispatch_apply_dc_s);
	} da_dc[DISPATCH_APPLY_MAX_CPUS];
	size_t i;

	for (i = 0; i < da->da_thr_cnt - 1; i++) {
		da_dc[i].do_vtable = NULL;
		da_dc[i].do_next = &da_dc[i + 1];
		da_dc[i].dc_func = func;
		da_dc[i].dc_ctxt = da;
	}

	da->da_sema = _dispatch_get_thread_semaphore();

	_dispatch_queue_push_list(dq, (void *)&da_dc[0],
			(void *)&da_dc[da->da_thr_cnt - 2]);
	// Call the first element directly
	_dispatch_apply2(da);
	_dispatch_workitem_inc();

	_dispatch_thread_semaphore_wait(da->da_sema);
	_dispatch_put_thread_semaphore(da->da_sema);
}

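// Redirect driver, used when 'dq' has a target queue: pre-accounts 'da_width'
// units of concurrency in dq_running for every queue on the way to the root
// queue. If a queue in the chain is already saturated, the accounting is
// rolled back for the affected queues and the requested thread count is
// trimmed (falling back to the serial case if nothing is left). The parallel
// apply then runs on the root queue via _dispatch_apply3(), and the
// accounting is undone afterwards.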
static void
_dispatch_apply_redirect(void *ctxt)
{
	struct dispatch_apply_s *da = ctxt;
	uint32_t da_width = 2 * (da->da_thr_cnt - 1);
	dispatch_queue_t dq = da->da_queue, rq = dq, tq;

	do {
		uint32_t running = dispatch_atomic_add2o(rq, dq_running, da_width);
		uint32_t width = rq->dq_width;
		if (slowpath(running > width)) {
			uint32_t excess = width > 1 ? running - width : da_width;
			for (tq = dq; 1; tq = tq->do_targetq) {
				(void)dispatch_atomic_sub2o(tq, dq_running, excess);
				if (tq == rq) {
					break;
				}
			}
			da_width -= excess;
			if (slowpath(!da_width)) {
				return _dispatch_apply_serial(da);
			}
			da->da_thr_cnt -= excess / 2;
		}
		rq = rq->do_targetq;
	} while (slowpath(rq->do_targetq));
	_dispatch_apply_f2(rq, da, _dispatch_apply3);
	do {
		(void)dispatch_atomic_sub2o(dq, dq_running, da_width);
		dq = dq->do_targetq;
	} while (slowpath(dq->do_targetq));
}

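// Public entry point for the function-pointer variant. 'da' lives on this
// thread's stack, which is safe because every path below is synchronous.
// Falls back to the serial case for narrow/serial queues (dq_width <= 2),
// when only one worker thread would be used, or when called from inside
// another dispatch_apply; takes the redirect path when 'dq' has a target
// queue; otherwise runs the parallel driver directly on 'dq'.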
void
dispatch_apply_f(size_t iterations, dispatch_queue_t dq, void *ctxt,
		void (*func)(void *, size_t))
{
	struct dispatch_apply_s da;

	da.da_func = func;
	da.da_ctxt = ctxt;
	da.da_iterations = iterations;
	da.da_index = 0;
	da.da_thr_cnt = _dispatch_hw_config.cc_max_active;
	da.da_queue = NULL;

	if (da.da_thr_cnt > DISPATCH_APPLY_MAX_CPUS) {
		da.da_thr_cnt = DISPATCH_APPLY_MAX_CPUS;
	}
	if (slowpath(iterations == 0)) {
		return;
	}
	if (iterations < da.da_thr_cnt) {
		da.da_thr_cnt = (uint32_t)iterations;
	}
	if (slowpath(dq->dq_width <= 2) || slowpath(da.da_thr_cnt <= 1) ||
			slowpath(_dispatch_thread_getspecific(dispatch_apply_key))) {
		return dispatch_sync_f(dq, &da, _dispatch_apply_serial);
	}
	dispatch_queue_t old_dq = _dispatch_thread_getspecific(dispatch_queue_key);
	if (slowpath(dq->do_targetq)) {
		if (slowpath(dq == old_dq)) {
			return dispatch_sync_f(dq, &da, _dispatch_apply_serial);
		}
		da.da_queue = dq;
		return dispatch_sync_f(dq, &da, _dispatch_apply_redirect);
	}
	dispatch_atomic_acquire_barrier();
	_dispatch_thread_setspecific(dispatch_queue_key, dq);
	_dispatch_apply_f2(dq, &da, _dispatch_apply2);
	_dispatch_thread_setspecific(dispatch_queue_key, old_dq);
}

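// Example (caller side, illustrative only; 'process_one', 'values' and
// 'count' are hypothetical caller-provided names):
//
//	static void process_one(void *ctxt, size_t i)
//	{
//		double *values = ctxt;
//		values[i] *= 2.0;
//	}
//	...
//	dispatch_apply_f(count, dispatch_get_global_queue(
//			DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), values, process_one);
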
#if DISPATCH_COCOA_COMPAT
static void
_dispatch_apply_slow(size_t iterations, dispatch_queue_t dq,
		void (^work)(size_t))
{
	struct Block_basic *bb = (void *)_dispatch_Block_copy((void *)work);
	dispatch_apply_f(iterations, dq, bb, (void *)bb->Block_invoke);
	Block_release(bb);
}
#endif

void
dispatch_apply(size_t iterations, dispatch_queue_t dq, void (^work)(size_t))
{
#if DISPATCH_COCOA_COMPAT
	// Under GC, blocks transferred to other threads must be Block_copy()ed
	// rdar://problem/7455071
	if (dispatch_begin_thread_4GC) {
		return _dispatch_apply_slow(iterations, dq, work);
	}
#endif
	struct Block_basic *bb = (void *)work;
	dispatch_apply_f(iterations, dq, bb, (void *)bb->Block_invoke);
}

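// Strided variants: full chunks of 'stride' iterations are distributed in
// parallel via dispatch_apply(), then the remaining (iterations % stride)
// iterations are handled serially via dispatch_sync() on the same queue.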
void
dispatch_stride(size_t offset, size_t stride, size_t iterations,
		dispatch_queue_t dq, void (^work)(size_t))
{
	struct Block_basic *bb = (void *)work;
	dispatch_stride_f(offset, stride, iterations, dq, bb,
			(void *)bb->Block_invoke);
}

void
dispatch_stride_f(size_t offset, size_t stride, size_t iterations,
		dispatch_queue_t dq, void *ctxt, void (*func)(void *, size_t))
{
	// Apply each full chunk of 'stride' iterations in parallel
	dispatch_apply(iterations / stride, dq, ^(size_t idx) {
		size_t i = idx * stride + offset;
		size_t stop = i + stride;
		do {
			func(ctxt, i++);
		} while (i < stop);
	});

	// Handle the remainder serially
	dispatch_sync(dq, ^{
		size_t i;
		for (i = iterations - (iterations % stride); i < iterations; i++) {
			func(ctxt, i + offset);
		}
	});
}