]>
Commit | Line | Data |
---|---|---|
34e8f829 A |
1 | --- psort.c.orig 2008-11-24 17:01:07.000000000 -0800 |
2 | +++ psort.c 2008-11-24 22:02:57.000000000 -0800 | |
3 | @@ -1,3 +1,4 @@ | |
4 | +/****************************************************************************/ | |
5 | /*- | |
6 | * Copyright (c) 1992, 1993 | |
7 | * The Regents of the University of California. All rights reserved. | |
8 | @@ -34,14 +35,22 @@ static char sccsid[] = "@(#)qsort.c 8.1 | |
9 | __FBSDID("$FreeBSD: src/lib/libc/stdlib/qsort.c,v 1.15 2008/01/14 09:21:34 das Exp $"); | |
10 | ||
11 | #include <stdlib.h> | |
12 | +#include <pthread.h> | |
13 | +#include <dispatch/dispatch.h> | |
14 | +#include <stddef.h> | |
15 | #include <string.h> | |
16 | +#include <libkern/OSAtomic.h> | |
17 | +#include <sys/mman.h> | |
18 | +#include <errno.h> | |
19 | +#define __APPLE_API_PRIVATE | |
20 | +#include <machine/cpu_capabilities.h> | |
21 | ||
22 | -#ifdef I_AM_QSORT_R | |
23 | +#ifdef I_AM_PSORT_R | |
24 | typedef int cmp_t(void *, const void *, const void *); | |
25 | #else | |
26 | typedef int cmp_t(const void *, const void *); | |
27 | #endif | |
28 | -#ifdef I_AM_QSORT_B | |
29 | +#ifdef I_AM_PSORT_B | |
30 | static inline char *med3(char *, char *, char *, cmp_t ^, void *) __attribute__((always_inline)); | |
31 | #else | |
32 | static inline char *med3(char *, char *, char *, cmp_t *, void *) __attribute__((always_inline)); | |
33 | @@ -50,6 +59,83 @@ static inline void swapfunc(char *, cha | |
34 | ||
35 | #define min(a, b) (a) < (b) ? a : b | |
36 | ||
37 | +#define NARGS ((PAGESIZE - offsetof(struct page, args)) / sizeof(union args)) /* args slots that fit in one page after the page header */ | |
38 | +#define PAGESIZE 4096 /* bytes requested from mmap for each page of args slots */ | |
39 | +#define PARALLEL_MIN_SIZE 2000 /* determine heuristically; smaller arrays are handed straight to qsort */ | |
40 | + | |
41 | +struct shared; /* forward reference */ | |
42 | +union args { /* one queued sub-sort; reused as a free-list node while idle */ | |
43 | + union args *next; /* link to the next free slot, set by getargs/returnargs */ | |
44 | + struct { | |
45 | + struct shared *shared; /* state common to the whole sort */ | |
46 | + void *a; /* start of the partition to sort */ | |
47 | + size_t n; /* number of elements in the partition */ | |
48 | + int depth_limit; /* remaining depth before _psort falls back to heapsort */ | |
49 | + } /* anonymous */; | |
50 | +}; | |
51 | + | |
52 | +struct page { /* header of one mmap'd page; the rest of the page holds args slots */ | |
53 | + struct page *next; /* next page on shared->pagelist */ | |
54 | + union args args[]; /* C99 flexible array member; NARGS slots fit in PAGESIZE */ | |
55 | +}; | |
56 | + | |
57 | +struct shared { /* state common to every task of one psort call */ | |
58 | + char *who; /* entry point name, printed if getargs fails inside _psort */ | |
59 | + union args *freelist; /* unused args slots */ | |
60 | + struct page *pagelist; /* every page mapped by getargs, unmapped when the sort ends */ | |
61 | +#ifdef I_AM_PSORT_R | |
62 | + void *thunk; /* caller context handed through to cmp */ | |
63 | +#endif | |
64 | +#ifdef I_AM_PSORT_B | |
65 | + cmp_t ^cmp; /* comparison block */ | |
66 | +#else | |
67 | + cmp_t *cmp; /* comparison function */ | |
68 | +#endif | |
69 | + size_t es; /* element size in bytes */ | |
70 | + size_t turnoff; /* partitions of more than this many elements are dispatched to the queue */ | |
71 | + dispatch_queue_t queue; /* queue that runs _psort_parallel tasks */ | |
72 | + pthread_cond_t cond; /* signalled when count drops to zero */ | |
73 | + pthread_mutex_t mutex; /* paired with cond */ | |
74 | + OSSpinLock sharedlock; /* held while freelist and pagelist are modified */ | |
75 | + int count; /* tasks queued or running */ | |
76 | +}; | |
77 | + | |
78 | +static union args * | |
79 | +getargs(struct shared *shared) /* pop a free args slot, mapping a new page when the list is empty; NULL (errno set by mmap) on failure */ | |
80 | +{ | |
81 | + union args *args = NULL; /* stays NULL if no page can be mapped */ | |
82 | + | |
83 | + OSSpinLockLock(&shared->sharedlock); | |
84 | + if(!shared->freelist) { | |
85 | + struct page *page; | |
86 | + int i; | |
87 | + /* mmap reports failure as MAP_FAILED, never NULL */ | |
88 | + if((page = (struct page *)mmap(NULL, PAGESIZE, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0)) == MAP_FAILED) | |
89 | + goto out; | |
90 | + page->next = shared->pagelist; | |
91 | + shared->pagelist = page; | |
92 | + /* thread every slot of the new page onto the (empty) free list */ | |
93 | + for(args = page->args, i = NARGS; i > 0; args++, i--) { | |
94 | + args->next = shared->freelist; | |
95 | + shared->freelist = args; | |
96 | + } | |
97 | + } | |
98 | + args = shared->freelist; | |
99 | + shared->freelist = args->next; | |
100 | +out: | |
101 | + OSSpinLockUnlock(&shared->sharedlock); | |
102 | + return args; | |
103 | +} | |
104 | + | |
105 | +static void | |
106 | +returnargs(struct shared *shared, union args *args) /* push a finished args slot back onto the shared free list */ | |
107 | +{ | |
108 | + OSSpinLockLock(&shared->sharedlock); /* same lock getargs takes around the free list */ | |
109 | + args->next = shared->freelist; | |
110 | + shared->freelist = args; | |
111 | + OSSpinLockUnlock(&shared->sharedlock); | |
112 | +} | |
113 | + | |
114 | /* | |
115 | * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function". | |
116 | */ | |
117 | @@ -88,7 +174,7 @@ swapfunc(a, b, n, swaptype) | |
118 | ||
119 | #define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype) | |
120 | ||
121 | -#ifdef I_AM_QSORT_R | |
122 | +#ifdef I_AM_PSORT_R | |
123 | #define CMP(t, x, y) (cmp((t), (x), (y))) | |
124 | #else | |
125 | #define CMP(t, x, y) (cmp((x), (y))) | |
126 | @@ -96,13 +182,13 @@ swapfunc(a, b, n, swaptype) | |
127 | ||
128 | static inline char * | |
129 | med3(char *a, char *b, char *c, | |
130 | -#ifdef I_AM_QSORT_B | |
131 | +#ifdef I_AM_PSORT_B | |
132 | cmp_t ^cmp, | |
133 | #else | |
134 | cmp_t *cmp, | |
135 | #endif | |
136 | void *thunk | |
137 | -#ifndef I_AM_QSORT_R | |
138 | +#ifndef I_AM_PSORT_R | |
139 | __unused | |
140 | #endif | |
141 | ) | |
142 | @@ -118,23 +204,25 @@ __unused | |
143 | #define DEPTH(x) (2 * (fls((int)(x)) - 1)) | |
144 | #endif /* __LP64__ */ | |
145 | ||
146 | -#ifdef I_AM_QSORT_R | |
147 | +#ifdef I_AM_PSORT_R | |
148 | int __heapsort_r(void *, size_t, size_t, void *, int (*)(void *, const void *, const void *)); | |
149 | #endif | |
150 | ||
151 | +static void _psort_parallel(void *x); | |
152 | + | |
153 | static void | |
154 | -_qsort(void *a, size_t n, size_t es, | |
155 | -#ifdef I_AM_QSORT_R | |
156 | +_psort(void *a, size_t n, size_t es, | |
157 | +#ifdef I_AM_PSORT_R | |
158 | void *thunk, | |
159 | #else | |
160 | #define thunk NULL | |
161 | #endif | |
162 | -#ifdef I_AM_QSORT_B | |
163 | +#ifdef I_AM_PSORT_B | |
164 | cmp_t ^cmp, | |
165 | #else | |
166 | cmp_t *cmp, | |
167 | #endif | |
168 | -int depth_limit) | |
169 | +int depth_limit, struct shared *shared) | |
170 | { | |
171 | char *pa, *pb, *pc, *pd, *pl, *pm, *pn; | |
172 | size_t d, r; | |
173 | @@ -143,9 +231,9 @@ int depth_limit) | |
174 | ||
175 | loop: | |
176 | if (depth_limit-- <= 0) { | |
177 | -#ifdef I_AM_QSORT_B | |
178 | +#ifdef I_AM_PSORT_B | |
179 | heapsort_b(a, n, es, cmp); | |
180 | -#elif defined(I_AM_QSORT_R) | |
181 | +#elif defined(I_AM_PSORT_R) | |
182 | __heapsort_r(a, n, es, thunk, cmp); | |
183 | #else | |
184 | heapsort(a, n, es, cmp); | |
185 | @@ -222,33 +310,135 @@ loop: | |
186 | } | |
187 | ||
188 | nevermind: | |
189 | - if ((r = pb - pa) > es) | |
190 | -#ifdef I_AM_QSORT_R | |
191 | - _qsort(a, r / es, es, thunk, cmp, depth_limit); | |
192 | + if ((r = pb - pa) > es) { | |
193 | + r /= es; | |
194 | + if (shared && r > shared->turnoff) { | |
195 | + union args *args = getargs(shared); | |
196 | + | |
197 | + if (args == NULL) | |
198 | + LIBC_ABORT("%s: getargs: %s", shared->who, strerror(errno)); | |
199 | + args->shared = shared; | |
200 | + args->a = a; | |
201 | + args->n = r; | |
202 | + args->depth_limit = depth_limit; | |
203 | + OSAtomicIncrement32(&shared->count); | |
204 | + dispatch_async_f(shared->queue, args, _psort_parallel); | |
205 | + } else { | |
206 | +#ifdef I_AM_PSORT_R | |
207 | + _psort(a, r, es, thunk, cmp, depth_limit, NULL); | |
208 | #else | |
209 | - _qsort(a, r / es, es, cmp, depth_limit); | |
210 | + _psort(a, r, es, cmp, depth_limit, NULL); | |
211 | #endif | |
212 | + } | |
213 | + } | |
214 | if ((r = pd - pc) > es) { | |
215 | /* Iterate rather than recurse to save stack space */ | |
216 | a = pn - r; | |
217 | n = r / es; | |
218 | goto loop; | |
219 | } | |
220 | -/* qsort(pn - r, r / es, es, cmp);*/ | |
221 | +/* psort(pn - r, r / es, es, cmp);*/ | |
222 | +} | |
223 | + | |
224 | +static void | |
225 | +_psort_parallel(void *x) /* dispatch work function: sort the partition described by x (a union args *), then retire the task */ | |
226 | +{ | |
227 | + union args *args = (union args *)x; | |
228 | + struct shared *shared = args->shared; | |
229 | + | |
230 | + _psort(args->a, args->n, shared->es, | |
231 | +#ifdef I_AM_PSORT_R | |
232 | + shared->thunk, | |
233 | +#endif | |
234 | + shared->cmp, args->depth_limit, shared); | |
235 | + returnargs(shared, args); | |
236 | + /* Retire under the mutex: once the waiter sees count == 0 it destroys the mutex and returns, ending the lifetime of *shared. */ | |
237 | + pthread_mutex_lock(&shared->mutex); | |
238 | + if(OSAtomicDecrement32(&shared->count) <= 0) | |
239 | + pthread_cond_signal(&shared->cond); | |
240 | + pthread_mutex_unlock(&shared->mutex); | |
241 | +} | |
242 | + | |
243 | +/* fast, approximate integer square root */ | |
244 | +static size_t | |
245 | +isqrt(size_t x) | |
246 | +{ | |
247 | + size_t s = 1L << (flsl(x) / 2); /* power-of-two first guess from the position of the highest set bit */ | |
248 | + return (s + x / s) / 2; /* refine the guess by averaging it with x / s */ | |
249 | } | |
250 | ||
251 | void | |
252 | -#ifdef I_AM_QSORT_R | |
253 | -qsort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp) | |
254 | -#elif defined(I_AM_QSORT_B) | |
255 | -qsort_b(void *a, size_t n, size_t es, cmp_t ^cmp) | |
256 | +#ifdef I_AM_PSORT_R | |
257 | +psort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp) | |
258 | +#elif defined(I_AM_PSORT_B) | |
259 | +psort_b(void *a, size_t n, size_t es, cmp_t ^cmp) | |
260 | #else | |
261 | -qsort(void *a, size_t n, size_t es, cmp_t *cmp) | |
262 | +psort(void *a, size_t n, size_t es, cmp_t *cmp) | |
263 | #endif | |
264 | { | |
265 | - _qsort(a, n, es, | |
266 | -#ifdef I_AM_QSORT_R | |
267 | - thunk, | |
268 | + if (n >= PARALLEL_MIN_SIZE && _NumCPUs() > 1) { /* go parallel only for large arrays when _NumCPUs() reports more than one */ | |
269 | + struct shared shared; | |
270 | + union args *args; | |
271 | + | |
272 | + bzero(&shared, sizeof(shared)); | |
273 | + shared.sharedlock = OS_SPINLOCK_INIT; | |
274 | + if ((args = getargs(&shared)) != NULL) { /* when no args slot can be had, fall through to plain qsort below */ | |
275 | + struct page *p, *pp; | |
276 | +#ifdef I_AM_PSORT_R | |
277 | + shared.who = "psort_r"; | |
278 | + shared.thunk = thunk; | |
279 | +#elif defined(I_AM_PSORT_B) | |
280 | + shared.who = "psort_b"; | |
281 | +#else | |
282 | + shared.who = "psort"; | |
283 | +#endif | |
284 | + shared.cmp = cmp; | |
285 | + shared.es = es; | |
286 | + shared.queue = dispatch_get_concurrent_queue(0); | |
287 | + shared.cond = (pthread_cond_t)PTHREAD_COND_INITIALIZER; | |
288 | + shared.mutex = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; | |
289 | + args->a = a; | |
290 | + args->n = n; | |
291 | + args->depth_limit = DEPTH(n); | |
292 | + args->shared = &shared; | |
293 | + /* | |
294 | + * The turnoff value is the partition size at or below | |
295 | + * which we stop dispatching work in parallel and just | |
296 | + * sort in the current thread. The value of sqrt(n) was | |
297 | + * determined heuristically. There is a smaller | |
298 | + * dependence on the slowness of the comparison | |
299 | + * function, and there might be a dependence on the | |
300 | + * number of processors, but that relationship has not | |
301 | + * been determined. Because the sensitivity to the turnoff | |
302 | + * value is relatively low, we use a fast, approximate | |
303 | + * integer square root routine that is good enough for | |
304 | + * this purpose. | |
305 | + */ | |
306 | + shared.turnoff = isqrt(n); | |
307 | + OSAtomicIncrement32(&shared.count); /* account for the root task run on the next line */ | |
308 | + _psort_parallel(args); /* sort the top-level partition on the calling thread */ | |
309 | | |
310 | + /* wait for queue to drain */ | |
311 | + pthread_mutex_lock(&shared.mutex); | |
312 | + while(shared.count > 0) | |
313 | + pthread_cond_wait(&shared.cond, &shared.mutex); | |
314 | | |
315 | + pthread_mutex_unlock(&shared.mutex); | |
316 | + pthread_mutex_destroy(&shared.mutex); | |
317 | + pthread_cond_destroy(&shared.cond); | |
318 | + for(p = shared.pagelist; p; p = pp) { /* release every page that getargs mapped */ | |
319 | + pp = p->next; | |
320 | + munmap(p, PAGESIZE); | |
321 | + } | |
322 | + return; | |
323 | + } | |
324 | + } | |
325 | + /* Just call qsort */ | |
326 | +#ifdef I_AM_PSORT_R | |
327 | + qsort_r(a, n, es, thunk, cmp); | |
328 | +#elif defined(I_AM_PSORT_B) | |
329 | + qsort_b(a, n, es, cmp); | |
330 | +#else | |
331 | + qsort(a, n, es, cmp); | |
332 | #endif | |
333 | - cmp, DEPTH(n)); | |
334 | } |