[apple/libc.git] / stdlib / psort.c.patch
--- psort.c.orig	2008-11-24 17:01:07.000000000 -0800
+++ psort.c	2008-11-24 22:02:57.000000000 -0800
@@ -1,3 +1,4 @@
+/****************************************************************************/
 /*-
  * Copyright (c) 1992, 1993
  * The Regents of the University of California.  All rights reserved.
@@ -34,14 +35,22 @@ static char sccsid[] = "@(#)qsort.c 8.1
 __FBSDID("$FreeBSD: src/lib/libc/stdlib/qsort.c,v 1.15 2008/01/14 09:21:34 das Exp $");

 #include <stdlib.h>
+#include <pthread.h>
+#include <dispatch/dispatch.h>
+#include <stddef.h>
 #include <string.h>
+#include <libkern/OSAtomic.h>
+#include <sys/mman.h>
+#include <errno.h>
+#define __APPLE_API_PRIVATE
+#include <machine/cpu_capabilities.h>

-#ifdef I_AM_QSORT_R
+#ifdef I_AM_PSORT_R
 typedef int cmp_t(void *, const void *, const void *);
 #else
 typedef int cmp_t(const void *, const void *);
 #endif
-#ifdef I_AM_QSORT_B
+#ifdef I_AM_PSORT_B
 static inline char *med3(char *, char *, char *, cmp_t ^, void *) __attribute__((always_inline));
 #else
 static inline char *med3(char *, char *, char *, cmp_t *, void *) __attribute__((always_inline));
@@ -50,6 +59,83 @@ static inline void swapfunc(char *, cha

 #define min(a, b) (a) < (b) ? a : b

+#define NARGS ((PAGESIZE - offsetof(struct page, args)) / sizeof(union args))
+#define PAGESIZE 4096
+#define PARALLEL_MIN_SIZE 2000	/* determined heuristically */
+
+struct shared;	/* forward reference */
+union args {
+	union args *next;
+	struct {
+		struct shared *shared;
+		void *a;
+		size_t n;
+		int depth_limit;
+	} /* anonymous */;
+};
+
+struct page {
+	struct page *next;
+	union args args[0];
+};
+
+struct shared {
+	char *who;
+	union args *freelist;
+	struct page *pagelist;
+#ifdef I_AM_PSORT_R
+	void *thunk;
+#endif
+#ifdef I_AM_PSORT_B
+	cmp_t ^cmp;
+#else
+	cmp_t *cmp;
+#endif
+	size_t es;
+	size_t turnoff;
+	dispatch_queue_t queue;
+	pthread_cond_t cond;
+	pthread_mutex_t mutex;
+	OSSpinLock sharedlock;
+	int count;
+};
+
+static union args *
+getargs(struct shared *shared)
+{
+	union args *args;
+
+	OSSpinLockLock(&shared->sharedlock);
+	if(!shared->freelist) {
+		struct page *page;
+		union args *prev;
+		int i;
+		if((page = (struct page *)mmap(NULL, PAGESIZE, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0)) == MAP_FAILED)
+			return NULL;
+		page->next = shared->pagelist;
+		shared->pagelist = page;
+		prev = NULL;
+		for(args = page->args, i = NARGS; i > 0; args++, i--) {
+			args->next = prev;
+			prev = args;
+		}
+		shared->freelist = prev;
+	}
+	args = shared->freelist;
+	shared->freelist = args->next;
+	OSSpinLockUnlock(&shared->sharedlock);
+	return args;
+}
+
+static void
+returnargs(struct shared *shared, union args *args)
+{
+	OSSpinLockLock(&shared->sharedlock);
+	args->next = shared->freelist;
+	shared->freelist = args;
+	OSSpinLockUnlock(&shared->sharedlock);
+}
+
 /*
  * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
  */
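
An editorial aside, not part of the patch: getargs() carves each mmap'ed page into as many union args slots as fit after the struct page header and threads them onto a freelist through their first member, so dispatching a sub-sort never calls malloc(). On a typical LP64 system union args is 32 bytes and offsetof(struct page, args) is 8, so NARGS works out to (4096 - 8) / 32 = 127 slots per page. A minimal standalone check of that arithmetic, with struct shared reduced to void * so the sketch is self-contained:

#include <stdio.h>
#include <stddef.h>

union args {
	union args *next;	/* chains free slots together */
	struct {
		void *shared;	/* stands in for struct shared * */
		void *a;
		size_t n;
		int depth_limit;
	};			/* anonymous, as in the patch (C11) */
};

struct page {
	struct page *next;
	union args args[0];	/* zero-length array, GNU C extension */
};

int
main(void)
{
	/* (4096 - 8) / 32 = 127 on a typical LP64 system */
	printf("%zu args per page\n",
	    (4096 - offsetof(struct page, args)) / sizeof(union args));
	return 0;
}
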
@@ -88,7 +174,7 @@ swapfunc(a, b, n, swaptype)

 #define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype)

-#ifdef I_AM_QSORT_R
+#ifdef I_AM_PSORT_R
 #define CMP(t, x, y) (cmp((t), (x), (y)))
 #else
 #define CMP(t, x, y) (cmp((x), (y)))
@@ -96,13 +182,13 @@ swapfunc(a, b, n, swaptype)

 static inline char *
 med3(char *a, char *b, char *c,
-#ifdef I_AM_QSORT_B
+#ifdef I_AM_PSORT_B
 cmp_t ^cmp,
 #else
 cmp_t *cmp,
 #endif
 void *thunk
-#ifndef I_AM_QSORT_R
+#ifndef I_AM_PSORT_R
 __unused
 #endif
 )
@@ -118,23 +204,25 @@ __unused
 #define DEPTH(x) (2 * (fls((int)(x)) - 1))
 #endif /* __LP64__ */

-#ifdef I_AM_QSORT_R
+#ifdef I_AM_PSORT_R
 int __heapsort_r(void *, size_t, size_t, void *, int (*)(void *, const void *, const void *));
 #endif

+static void _psort_parallel(void *x);
+
 static void
-_qsort(void *a, size_t n, size_t es,
-#ifdef I_AM_QSORT_R
+_psort(void *a, size_t n, size_t es,
+#ifdef I_AM_PSORT_R
 void *thunk,
 #else
 #define thunk NULL
 #endif
-#ifdef I_AM_QSORT_B
+#ifdef I_AM_PSORT_B
 cmp_t ^cmp,
 #else
 cmp_t *cmp,
 #endif
-int depth_limit)
+int depth_limit, struct shared *shared)
 {
 	char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
 	size_t d, r;
@@ -143,9 +231,9 @@ int depth_limit)

 loop:
 	if (depth_limit-- <= 0) {
-#ifdef I_AM_QSORT_B
+#ifdef I_AM_PSORT_B
 		heapsort_b(a, n, es, cmp);
-#elif defined(I_AM_QSORT_R)
+#elif defined(I_AM_PSORT_R)
 		__heapsort_r(a, n, es, thunk, cmp);
 #else
 		heapsort(a, n, es, cmp);
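
An editorial aside, not part of the patch: the depth_limit plumbing above is the introsort safeguard that psort inherits from qsort.c. In the branch shown, DEPTH(x) = 2 * (fls((int)(x)) - 1), roughly twice log2(n); once a partition chain exceeds that depth, the code falls back to heapsort, bounding the worst case at O(n log n). For example, n = 1,000,000 gives fls(n) = 20 and a limit of 38 levels. A quick standalone check (fls() is declared in <strings.h> on BSD-derived systems such as Mac OS X):

#include <stdio.h>
#include <strings.h>

#define DEPTH(x) (2 * (fls((int)(x)) - 1))

int
main(void)
{
	printf("DEPTH(1000000) = %d\n", DEPTH(1000000));	/* prints 38 */
	return 0;
}
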
@@ -222,33 +310,135 @@ loop:
 	}

 nevermind:
-	if ((r = pb - pa) > es)
-#ifdef I_AM_QSORT_R
-		_qsort(a, r / es, es, thunk, cmp, depth_limit);
+	if ((r = pb - pa) > es) {
+		r /= es;
+		if (shared && r > shared->turnoff) {
+			union args *args = getargs(shared);
+
+			if (args == NULL)
+				LIBC_ABORT("%s: getargs: %s", shared->who, strerror(errno));
+			args->shared = shared;
+			args->a = a;
+			args->n = r;
+			args->depth_limit = depth_limit;
+			OSAtomicIncrement32(&shared->count);
+			dispatch_async_f(shared->queue, args, _psort_parallel);
+		} else {
+#ifdef I_AM_PSORT_R
+			_psort(a, r, es, thunk, cmp, depth_limit, NULL);
 #else
-		_qsort(a, r / es, es, cmp, depth_limit);
+			_psort(a, r, es, cmp, depth_limit, NULL);
 #endif
+		}
+	}
 	if ((r = pd - pc) > es) {
 		/* Iterate rather than recurse to save stack space */
 		a = pn - r;
 		n = r / es;
 		goto loop;
 	}
-/*		qsort(pn - r, r / es, es, cmp);*/
+/*		psort(pn - r, r / es, es, cmp);*/
+}
+
+static void
+_psort_parallel(void *x)
+{
+	union args *args = (union args *)x;
+	struct shared *shared = args->shared;
+
+	_psort(args->a, args->n, shared->es,
+#ifdef I_AM_PSORT_R
+	    shared->thunk,
+#endif
+	    shared->cmp, args->depth_limit, shared);
+	returnargs(shared, args);
+	if(OSAtomicDecrement32(&shared->count) <= 0) {
+		pthread_mutex_lock(&shared->mutex);
+		pthread_cond_signal(&shared->cond);
+		pthread_mutex_unlock(&shared->mutex);
+	}
+}
+
+/* fast, approximate integer square root */
+static size_t
+isqrt(size_t x)
+{
+	size_t s = 1L << (flsl(x) / 2);
+	return (s + x / s) / 2;
 }

 void
-#ifdef I_AM_QSORT_R
-qsort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp)
-#elif defined(I_AM_QSORT_B)
-qsort_b(void *a, size_t n, size_t es, cmp_t ^cmp)
+#ifdef I_AM_PSORT_R
+psort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp)
+#elif defined(I_AM_PSORT_B)
+psort_b(void *a, size_t n, size_t es, cmp_t ^cmp)
 #else
-qsort(void *a, size_t n, size_t es, cmp_t *cmp)
+psort(void *a, size_t n, size_t es, cmp_t *cmp)
 #endif
 {
-	_qsort(a, n, es,
-#ifdef I_AM_QSORT_R
-	    thunk,
+	if (n >= PARALLEL_MIN_SIZE && _NumCPUs() > 1) {
+		struct shared shared;
+		union args *args;
+
+		bzero(&shared, sizeof(shared));
+		shared.sharedlock = OS_SPINLOCK_INIT;
+		if ((args = getargs(&shared)) != NULL) {
+			struct page *p, *pp;
+#ifdef I_AM_PSORT_R
+			shared.who = "psort_r";
+			shared.thunk = thunk;
+#elif defined(I_AM_PSORT_B)
+			shared.who = "psort_b";
+#else
+			shared.who = "psort";
+#endif
+			shared.cmp = cmp;
+			shared.es = es;
+			shared.queue = dispatch_get_concurrent_queue(0);
+			shared.cond = (pthread_cond_t)PTHREAD_COND_INITIALIZER;
+			shared.mutex = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
+			args->a = a;
+			args->n = n;
+			args->depth_limit = DEPTH(n);
+			args->shared = &shared;
+			/*
+			 * The turnoff value is the partition size below
+			 * which we stop recursing in parallel and simply
+			 * sort in the current thread.  The value sqrt(n)
+			 * was determined heuristically.  There is a smaller
+			 * dependence on the cost of the comparison function,
+			 * and there may be a dependence on the number of
+			 * processors, but no formula capturing these has
+			 * been determined.  Because sensitivity to the
+			 * turnoff value is relatively low, we use a fast,
+			 * approximate integer square root routine that is
+			 * good enough for this purpose.
+			 */
+			shared.turnoff = isqrt(n);
+			OSAtomicIncrement32(&shared.count);
+			_psort_parallel(args);
+
+			/* wait for queue to drain */
+			pthread_mutex_lock(&shared.mutex);
+			while(shared.count > 0)
+				pthread_cond_wait(&shared.cond, &shared.mutex);
+
+			pthread_mutex_unlock(&shared.mutex);
+			pthread_mutex_destroy(&shared.mutex);
+			pthread_cond_destroy(&shared.cond);
+			for(p = shared.pagelist; p; p = pp) {
+				pp = p->next;
+				munmap(p, PAGESIZE);
+			}
+			return;
+		}
+	}
+	/* Just call qsort */
+#ifdef I_AM_PSORT_R
+	qsort_r(a, n, es, thunk, cmp);
+#elif defined(I_AM_PSORT_B)
+	qsort_b(a, n, es, cmp);
+#else
+	qsort(a, n, es, cmp);
 #endif
-	    cmp, DEPTH(n));
 }
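
Two closing editorial notes, neither part of the patch. First, isqrt() is one Newton-Raphson step seeded with a power of two near the true root: for n = 1,000,000, flsl(n) = 20, so s = 1 << 10 = 1024 and (1024 + 1000000 / 1024) / 2 = (1024 + 976) / 2 = 1000, the exact root in this case. Second, a minimal usage sketch of the resulting interface, assuming Mac OS X 10.6 or later, where psort(3) is declared in <stdlib.h>; for arrays smaller than PARALLEL_MIN_SIZE, or on a single-CPU machine, the call simply falls through to qsort():

#include <stdio.h>
#include <stdlib.h>

static int
cmp_int(const void *l, const void *r)
{
	int a = *(const int *)l, b = *(const int *)r;
	return (a > b) - (a < b);	/* avoids the overflow of a - b */
}

int
main(void)
{
	enum { N = 100000 };	/* big enough to take the parallel path */
	int *v = malloc(N * sizeof(*v));
	if (v == NULL)
		return 1;
	for (int i = 0; i < N; i++)
		v[i] = rand();
	psort(v, N, sizeof(*v), cmp_int);
	printf("min=%d max=%d\n", v[0], v[N - 1]);
	free(v);
	return 0;
}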