1 --- psort.c.orig 2008-11-24 17:01:07.000000000 -0800
2 +++ psort.c 2008-11-24 22:02:57.000000000 -0800
4 +/****************************************************************************/
6 * Copyright (c) 1992, 1993
7 * The Regents of the University of California. All rights reserved.
8 @@ -34,14 +35,22 @@ static char sccsid[] = "@(#)qsort.c 8.1
9 __FBSDID("$FreeBSD: src/lib/libc/stdlib/qsort.c,v 1.15 2008/01/14 09:21:34 das Exp $");
13 +#include <dispatch/dispatch.h>
16 +#include <libkern/OSAtomic.h>
17 +#include <sys/mman.h>
19 +#define __APPLE_API_PRIVATE
20 +#include <machine/cpu_capabilities.h>
24 typedef int cmp_t(void *, const void *, const void *);
26 typedef int cmp_t(const void *, const void *);
30 static inline char *med3(char *, char *, char *, cmp_t ^, void *) __attribute__((always_inline));
32 static inline char *med3(char *, char *, char *, cmp_t *, void *) __attribute__((always_inline));
33 @@ -50,6 +59,83 @@ static inline void swapfunc(char *, cha
35 #define min(a, b) (a) < (b) ? a : b
37 +#define NARGS ((PAGESIZE - offsetof(struct page, args)) / sizeof(union args))
38 +#define PAGESIZE 4096
39 +#define PARALLEL_MIN_SIZE 2000 /* determine heuristically */
41 +struct shared; /* forward reference */
45 + struct shared *shared;
59 + union args *freelist;
60 + struct page *pagelist;
71 + dispatch_queue_t queue;
72 + pthread_cond_t cond;
73 + pthread_mutex_t mutex;
74 + OSSpinLock sharedlock;
79 +getargs(struct shared *shared) /* pop one args record off shared->freelist, mapping a fresh page first when the list is empty */
83 + OSSpinLockLock(&shared->sharedlock);
84 + if(!shared->freelist) {
88 + if((page = (struct page *)mmap(NULL, PAGESIZE, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0)) == MAP_FAILED) /* mmap reports failure as MAP_FAILED, never NULL */
90 + page->next = shared->pagelist;
91 + shared->pagelist = page;
93 + for(args = page->args, i = NARGS; i > 0; args++, i--) {
97 + shared->freelist = prev;
99 + args = shared->freelist;
100 + shared->freelist = args->next;
101 + OSSpinLockUnlock(&shared->sharedlock);
106 +returnargs(struct shared *shared, union args *args)
108 + OSSpinLockLock(&shared->sharedlock);
109 + args->next = shared->freelist;
110 + shared->freelist = args;
111 + OSSpinLockUnlock(&shared->sharedlock);
115 * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
117 @@ -88,7 +174,7 @@ swapfunc(a, b, n, swaptype)
119 #define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype)
123 #define CMP(t, x, y) (cmp((t), (x), (y)))
125 #define CMP(t, x, y) (cmp((x), (y)))
126 @@ -96,13 +182,13 @@ swapfunc(a, b, n, swaptype)
129 med3(char *a, char *b, char *c,
137 -#ifndef I_AM_QSORT_R
138 +#ifndef I_AM_PSORT_R
142 @@ -118,23 +204,25 @@ __unused
143 #define DEPTH(x) (2 * (fls((int)(x)) - 1))
144 #endif /* __LP64__ */
148 int __heapsort_r(void *, size_t, size_t, void *, int (*)(void *, const void *, const void *));
151 +static void _psort_parallel(void *x);
154 -_qsort(void *a, size_t n, size_t es,
156 +_psort(void *a, size_t n, size_t es,
169 +int depth_limit, struct shared *shared)
171 char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
173 @@ -143,9 +231,9 @@ int depth_limit)
176 if (depth_limit-- <= 0) {
179 heapsort_b(a, n, es, cmp);
180 -#elif defined(I_AM_QSORT_R)
181 +#elif defined(I_AM_PSORT_R)
182 __heapsort_r(a, n, es, thunk, cmp);
184 heapsort(a, n, es, cmp);
185 @@ -222,33 +310,135 @@ loop:
189 - if ((r = pb - pa) > es)
191 - _qsort(a, r / es, es, thunk, cmp, depth_limit);
192 + if ((r = pb - pa) > es) {
194 + if (shared && r > shared->turnoff) {
195 + union args *args = getargs(shared);
198 + LIBC_ABORT("%s: getargs: %s", shared->who, strerror(errno));
199 + args->shared = shared;
202 + args->depth_limit = depth_limit;
203 + OSAtomicIncrement32(&shared->count);
204 + dispatch_async_f(shared->queue, args, _psort_parallel);
207 + _psort(a, r, es, thunk, cmp, depth_limit, NULL);
209 - _qsort(a, r / es, es, cmp, depth_limit);
210 + _psort(a, r, es, cmp, depth_limit, NULL);
214 if ((r = pd - pc) > es) {
215 /* Iterate rather than recurse to save stack space */
220 -/* qsort(pn - r, r / es, es, cmp);*/
221 +/* psort(pn - r, r / es, es, cmp);*/
225 +_psort_parallel(void *x)
227 + union args *args = (union args *)x;
228 + struct shared *shared = args->shared;
230 + _psort(args->a, args->n, shared->es,
234 + shared->cmp, args->depth_limit, shared);
235 + returnargs(shared, args);
236 + if(OSAtomicDecrement32(&shared->count) <= 0) {
237 + pthread_mutex_lock(&shared->mutex);
238 + pthread_cond_signal(&shared->cond);
239 + pthread_mutex_unlock(&shared->mutex);
243 +/* fast, approximate integer square root */
247 + size_t s = 1L << (flsl(x) / 2);
248 + return (s + x / s) / 2;
253 -qsort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp)
254 -#elif defined(I_AM_QSORT_B)
255 -qsort_b(void *a, size_t n, size_t es, cmp_t ^cmp)
257 +psort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp)
258 +#elif defined(I_AM_PSORT_B)
259 +psort_b(void *a, size_t n, size_t es, cmp_t ^cmp)
261 -qsort(void *a, size_t n, size_t es, cmp_t *cmp)
262 +psort(void *a, size_t n, size_t es, cmp_t *cmp)
268 + if (n >= PARALLEL_MIN_SIZE && _NumCPUs() > 1) {
269 + struct shared shared;
272 + bzero(&shared, sizeof(shared));
273 + shared.sharedlock = OS_SPINLOCK_INIT;
274 + if ((args = getargs(&shared)) != NULL) {
275 + struct page *p, *pp;
277 + shared.who = "psort_r";
278 + shared.thunk = thunk;
279 +#elif defined(I_AM_PSORT_B)
280 + shared.who = "psort_b";
282 + shared.who = "psort";
286 + shared.queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
287 + shared.cond = (pthread_cond_t)PTHREAD_COND_INITIALIZER;
288 + shared.mutex = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
291 + args->depth_limit = DEPTH(n);
292 + args->shared = &shared;
294 + * The turnoff value is the partition size below which
295 + * we stop sorting in parallel and just sort in the
296 + * current thread. The value of sqrt(n) was
297 + * determined heuristically. There is a smaller
298 + * dependence on the slowness of the comparison
299 + * function, and there might be a dependence on the
300 + * number of processors, but the algorithm has not been
301 + * determined. Because the sensitivity to the turnoff
302 + * value is relatively low, we use a fast, approximate
303 + * integer square root routine that is good enough for
306 + shared.turnoff = isqrt(n);
307 + OSAtomicIncrement32(&shared.count);
308 + _psort_parallel(args);
310 + /* wait for queue to drain */
311 + pthread_mutex_lock(&shared.mutex);
312 + while(shared.count > 0)
313 + pthread_cond_wait(&shared.cond, &shared.mutex);
315 + pthread_mutex_unlock(&shared.mutex);
316 + pthread_mutex_destroy(&shared.mutex);
317 + pthread_cond_destroy(&shared.cond);
318 + for(p = shared.pagelist; p; p = pp) {
320 + munmap(p, PAGESIZE);
325 + /* Just call qsort */
327 + qsort_r(a, n, es, thunk, cmp);
328 +#elif defined(I_AM_PSORT_B)
329 + qsort_b(a, n, es, cmp);
331 + qsort(a, n, es, cmp);