From 3b2a1fe8d3d02703ddca1b0ead469074d4e47820 Mon Sep 17 00:00:00 2001 From: Apple Date: Wed, 29 Aug 2001 23:32:14 +0000 Subject: [PATCH] Libc-186.tar.gz --- Makefile.postamble | 1 + gen.subproj/crypt.c | 46 +- gen.subproj/popen.c | 57 +- gen.subproj/ppc.subproj/Makefile | 8 +- gen.subproj/ppc.subproj/PB.project | 4 +- gen.subproj/ppc.subproj/bcopy.s | 410 ----------- gen.subproj/ppc.subproj/blockmoof.s | 940 ++++++++++++++++++++++++++ gen.subproj/ppc.subproj/memcpy.s | 23 - gen.subproj/ppc.subproj/memmove.s | 23 - gen.subproj/scalable_malloc.c | 42 +- locale.subproj/rune.c | 2 +- locale.subproj/setlocale.c | 2 +- mach.subproj/mach_init.c | 6 +- pthreads.subproj/pthread.c | 100 ++- pthreads.subproj/pthread_cond.c | 3 +- pthreads.subproj/pthread_internals.h | 35 +- pthreads.subproj/pthread_mutex.c | 4 +- stdio.subproj/vfprintf.c | 53 +- stdio.subproj/vfscanf.c | 38 +- stdlib.subproj/strtod.c | 46 +- string.subproj/memccpy.c | 5 +- sys.subproj/gettimeofday.c | 25 +- sys.subproj/i386.subproj/vfork.s | 21 + sys.subproj/ppc.subproj/_longjmp.s | 150 +++- sys.subproj/ppc.subproj/_setjmp.h | 18 + sys.subproj/ppc.subproj/_setjmp.s | 85 ++- sys.subproj/ppc.subproj/ur_cthread.s | 4 +- sys.subproj/ppc.subproj/vfork.s | 2 +- threads.subproj/Makefile | 2 +- threads.subproj/PB.project | 2 +- threads.subproj/i386.subproj/thread.c | 2 +- threads.subproj/threads_data.c | 44 -- util.subproj/pty.c | 2 +- 33 files changed, 1485 insertions(+), 720 deletions(-) delete mode 100644 gen.subproj/ppc.subproj/bcopy.s create mode 100755 gen.subproj/ppc.subproj/blockmoof.s delete mode 100644 gen.subproj/ppc.subproj/memcpy.s delete mode 100644 gen.subproj/ppc.subproj/memmove.s delete mode 100644 threads.subproj/threads_data.c diff --git a/Makefile.postamble b/Makefile.postamble index e702b54..2083c13 100644 --- a/Makefile.postamble +++ b/Makefile.postamble @@ -5,6 +5,7 @@ PRODUCTS += $(LIBRARY_PREFIX)$(NAME)$(DEBUG_SUFFIX)$(LIBRARY_EXT) PRODUCTS += $(LIBRARY_PREFIX)$(NAME)$(PROFILE_SUFFIX)$(LIBRARY_EXT) PRODUCTS += $(LIBRARY_PREFIX)$(NAME)$(STATIC_SUFFIX)$(LIBRARY_EXT) RECURSIVE_FLAGS += "LINK_SUBPROJECTS = NO" +OTHER_CFLAGS += -no-cpp-precomp -force_cpusubtype_ALL static: $(SILENT) unset $(CUMULATIVE_VARIABLES) ||: ; \ diff --git a/gen.subproj/crypt.c b/gen.subproj/crypt.c index 5e2caec..2f56953 100644 --- a/gen.subproj/crypt.c +++ b/gen.subproj/crypt.c @@ -59,6 +59,7 @@ #include #include #include +#include /* * UNIX password, and DES, encryption. @@ -465,19 +466,24 @@ static unsigned char itoa64[] = /* 0..63 => ascii-64 */ static unsigned char a64toi[128]; /* ascii-64 => 0..63 */ /* Initial key schedule permutation */ -static C_block PC1ROT[64/CHUNKBITS][1< final permutation table */ -static C_block CF6464[64/CHUNKBITS][1<> 1) & 0x55555555L; L1 = R0 | R1; /* L1 is the odd-numbered input bits */ STORE(L,L0,L1,B); - PERM3264(L,L0,L1,B.b, (C_block *)IE3264); /* even bits */ - PERM3264(R,R0,R1,B.b+4,(C_block *)IE3264); /* odd bits */ + PERM3264(L,L0,L1,B.b,IE3264); /* even bits */ + PERM3264(R,R0,R1,B.b+4,IE3264); /* odd bits */ if (num_iter >= 0) { /* encryption */ @@ -689,14 +695,14 @@ STATIC int des_cipher(in, out, salt, num_iter) #define SPTAB(t, i) (*(long *)((unsigned char *)t + i*(sizeof(long)/4))) #if defined(gould) /* use this if B.b[i] is evaluated just once ... 
*/ -#define DOXOR(x,y,i) x^=SPTAB(SPE[0][i],B.b[i]); y^=SPTAB(SPE[1][i],B.b[i]); +#define DOXOR(x,y,i) x^=SPTAB(&SPE[i * 64],B.b[i]); y^=SPTAB(&SPE[(8 * 64) + (i * 64)],B.b[i]); #else #if defined(pdp11) /* use this if your "long" int indexing is slow */ -#define DOXOR(x,y,i) j=B.b[i]; x^=SPTAB(SPE[0][i],j); y^=SPTAB(SPE[1][i],j); +#define DOXOR(x,y,i) j=B.b[i]; x^=SPTAB(&SPE[i * 64],j); y^=SPTAB(&SPE[(8 * 64) + (i * 64)],j); #else /* use this if "k" is allocated to a register ... */ -#define DOXOR(x,y,i) k=B.b[i]; x^=SPTAB(SPE[0][i],k); y^=SPTAB(SPE[1][i],k); +#define DOXOR(x,y,i) k=B.b[i]; x^=SPTAB(&SPE[i * 64],k); y^=SPTAB(&SPE[(8 * 64) + (i * 64)],k); #endif #endif @@ -731,7 +737,7 @@ STATIC int des_cipher(in, out, salt, num_iter) L0 = ((L0 >> 3) & 0x0f0f0f0fL) | ((L1 << 1) & 0xf0f0f0f0L); L1 = ((R0 >> 3) & 0x0f0f0f0fL) | ((R1 << 1) & 0xf0f0f0f0L); STORE(L,L0,L1,B); - PERM6464(L,L0,L1,B.b, (C_block *)CF6464); + PERM6464(L,L0,L1,B.b,CF6464); #if defined(MUST_ALIGN) STORE(L,L0,L1,B); out[0] = B.b[0]; out[1] = B.b[1]; out[2] = B.b[2]; out[3] = B.b[3]; @@ -781,6 +787,9 @@ STATIC void init_des() #ifdef DEBUG prtab("pc1tab", perm, 8); #endif + PC1ROT = (C_block *)calloc(sizeof(C_block), (64/CHUNKBITS) * (1<= 0; ) k = (k<<1) | tmp32[perm[i]-1]; - TO_SIX_BIT(SPE[0][tableno][j], k); + TO_SIX_BIT(SPE[(tableno * 64) + j], k); k = 0; for (i = 24; --i >= 0; ) k = (k<<1) | tmp32[perm[i+24]-1]; - TO_SIX_BIT(SPE[1][tableno][j], k); + TO_SIX_BIT(SPE[(8 * 64) + (tableno * 64) + j], k); } } } @@ -891,7 +903,7 @@ STATIC void init_des() * "perm" must be all-zeroes on entry to this routine. */ STATIC void init_perm(perm, p, chars_in, chars_out) - C_block perm[64/CHUNKBITS][1<>3] |= 1<<(k&07); + perm[(i * (1<>3] |= 1<<(k&07); } } } diff --git a/gen.subproj/popen.c b/gen.subproj/popen.c index 7729280..885d6c6 100644 --- a/gen.subproj/popen.c +++ b/gen.subproj/popen.c @@ -2,13 +2,13 @@ * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ - * + * * The contents of this file constitute Original Code as defined in and * are subject to the Apple Public Source License Version 1.1 (the * "License"). You may not use this file except in compliance with the * License. Please obtain a copy of the License at * http://www.apple.com/publicsource and read it before using this file. - * + * * This Original Code and all software distributed under the License are * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -16,7 +16,7 @@ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the * License for the specific language governing rights and limitations * under the License. - * + * * @APPLE_LICENSE_HEADER_END@ */ /* @@ -55,7 +55,6 @@ * SUCH DAMAGE. 
*/ - #include #include #include @@ -67,6 +66,9 @@ #include #include #include +#include + +#define environ *(_NSGetEnviron()) static struct pid { struct pid *next; @@ -81,38 +83,57 @@ popen(command, type) struct pid *cur; FILE *iop; int pdes[2], pid, twoway; + char *argv[4]; + struct pid *p; if (strchr(type, '+')) { twoway = 1; type = "r+"; - if (socketpair(AF_UNIX, SOCK_STREAM, 0, pdes) < 0) - return (NULL); + if (socketpair(AF_UNIX, SOCK_STREAM, 0, pdes) < 0) + return (NULL); } else { twoway = 0; - if (*type != 'r' && *type != 'w' || type[1] || - (pipe(pdes) < 0)) + if ((*type != 'r' && *type != 'w') || type[1]) return (NULL); } + if (pipe(pdes) < 0) + return (NULL); - if ((cur = malloc(sizeof(struct pid))) == NULL) + if ((cur = malloc(sizeof(struct pid))) == NULL) { + (void)close(pdes[0]); + (void)close(pdes[1]); return (NULL); + } + + argv[0] = "sh"; + argv[1] = "-c"; + argv[2] = (char *)command; + argv[3] = NULL; switch (pid = vfork()) { case -1: /* Error. */ (void)close(pdes[0]); (void)close(pdes[1]); - (void)free(cur); + free(cur); return (NULL); /* NOTREACHED */ case 0: /* Child. */ if (*type == 'r') { + /* + * The _dup2() to STDIN_FILENO is repeated to avoid + * writing to pdes[1], which might corrupt the + * parent's copy. This isn't good enough in + * general, since the _exit() is no return, so + * the compiler is free to corrupt all the local + * variables. + */ + (void)close(pdes[0]); if (pdes[1] != STDOUT_FILENO) { (void)dup2(pdes[1], STDOUT_FILENO); (void)close(pdes[1]); - pdes[1] = STDOUT_FILENO; - } - (void) close(pdes[0]); - if (twoway && (pdes[1] != STDIN_FILENO)) + if (twoway) + (void)dup2(STDOUT_FILENO, STDIN_FILENO); + } else if (twoway && (pdes[1] != STDIN_FILENO)) (void)dup2(pdes[1], STDIN_FILENO); } else { if (pdes[0] != STDIN_FILENO) { @@ -120,8 +141,11 @@ popen(command, type) (void)close(pdes[0]); } (void)close(pdes[1]); + } + for (p = pidlist; p; p = p->next) { + (void)close(fileno(p->fp)); } - execl(_PATH_BSHELL, "sh", "-c", command, NULL); + execve(_PATH_BSHELL, argv, environ); _exit(127); /* NOTREACHED */ } @@ -154,7 +178,6 @@ pclose(iop) FILE *iop; { register struct pid *cur, *last; - int omask; int pstat; pid_t pid; @@ -168,7 +191,7 @@ pclose(iop) (void)fclose(iop); do { - pid = waitpid(cur->pid, &pstat, 0); + pid = wait4(cur->pid, &pstat, 0, (struct rusage *)0); } while (pid == -1 && errno == EINTR); /* Remove the entry from the linked list. 
*/ diff --git a/gen.subproj/ppc.subproj/Makefile b/gen.subproj/ppc.subproj/Makefile index 2a0ec70..0c95a7b 100644 --- a/gen.subproj/ppc.subproj/Makefile +++ b/gen.subproj/ppc.subproj/Makefile @@ -14,16 +14,16 @@ PROJECT_TYPE = Component HFILES = fp.h genassym.h -OTHERLINKED = abs.s bcopy.s bzero.s ffs.s mcount.s memcpy.s\ - memmove.s strlen.s +OTHERLINKED = abs.s blockmoof.s bzero.s ffs.s mcount.s \ + strlen.s CFILES = bcmp.c ecvt.c insque.c isinf.c remque.c setjmperr.c\ strcat.c strcpy.c strncat.c strncmp.c strncpy.c OTHERSRCS = Makefile.preamble Makefile Makefile.postamble -OTHERLINKEDOFILES = abs.o bcopy.o bzero.o ffs.o mcount.o memcpy.o\ - memmove.o strlen.o +OTHERLINKEDOFILES = abs.o blockmoof.o bzero.o ffs.o mcount.o \ + strlen.o MAKEFILEDIR = $(MAKEFILEPATH)/pb_makefiles CODE_GEN_STYLE = DYNAMIC diff --git a/gen.subproj/ppc.subproj/PB.project b/gen.subproj/ppc.subproj/PB.project index d1d8013..6fec101 100644 --- a/gen.subproj/ppc.subproj/PB.project +++ b/gen.subproj/ppc.subproj/PB.project @@ -5,15 +5,13 @@ OTHER_LINKED = ( abs.s, bcmp.c, - bcopy.s, + blockmoof.s, bzero.s, ecvt.c, ffs.s, insque.c, isinf.c, mcount.s, - memcpy.s, - memmove.s, remque.c, setjmperr.c, strcat.c, diff --git a/gen.subproj/ppc.subproj/bcopy.s b/gen.subproj/ppc.subproj/bcopy.s deleted file mode 100644 index 38ffd42..0000000 --- a/gen.subproj/ppc.subproj/bcopy.s +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -; -; Copy bytes of data around. handles overlapped data. -; -; Change this to use Altivec later on -; - -; -; void bcopy(from, to, nbytes) -; - -; Use CR5_lt to indicate non-cached -#define noncache 20 -.text -.align 2 -#if !defined(MEMCPY) && !defined(MEMMOVE) -.globl _bcopy -_bcopy: - crclr noncache ; Set cached - cmplw cr1,r4,r3 ; Compare "to" and "from" - mr. r5,r5 ; Check if we have a 0 length - mr r6,r3 ; Set source - beqlr- cr1 ; Bail if "to" and "from" are the same - beqlr- ; Bail if length is 0 - b Lcopyit ; Go copy it... - -; -; When we move the memory, forward overlays must be handled. We -; also can not use the cache instructions if we are from bcopy_nc. -; We need to preserve R3 because it needs to be returned for memcpy. -; We can be interrupted and lose control here. -; -; There is no stack, so in order to used floating point, we would -; need to take the FP exception. Any potential gains by using FP -; would be more than eaten up by this. -; -; Later, we should used Altivec for large moves. -; - -#else -#if defined(MEMCPY) -.globl _memcpy -_memcpy: -#endif - -#if defined(MEMMOVE) -.globl _memmove -_memmove: -#endif - cmplw cr1,r3,r4 ; "to" and "from" the same? 
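Aside: both the deleted bcopy.s here and the blockmoof.s that replaces it funnel bcopy, memcpy and memmove into one shared copy body; the entry points only differ in how they map the C argument order, since bcopy takes (source, destination, length) while memcpy/memmove take (destination, source, length) and must return the destination. A minimal C sketch of that arrangement, with a hypothetical helper named copy_engine standing in for the shared assembly body (names are illustrative, not from the patch):

#include <stddef.h>

/* Hypothetical stand-in for the shared assembly body: copies len bytes
 * and tolerates overlapping buffers, like the code in this file. */
static void copy_engine(char *dst, const char *src, size_t len)
{
    if (dst == src || len == 0)
        return;
    if (dst < src || dst >= src + len) {        /* no overlap with the tail: ascending copy */
        while (len--)
            *dst++ = *src++;
    } else {                                    /* destination overlaps source: descending copy */
        dst += len;
        src += len;
        while (len--)
            *--dst = *--src;
    }
}

/* bcopy-style entry: (source, destination, length), arguments swapped. */
void xbcopy(const void *src, void *dst, size_t len)
{
    copy_engine(dst, src, len);
}

/* memcpy/memmove-style entry: (destination, source, length), returns dst. */
void *xmemmove(void *dst, const void *src, size_t len)
{
    copy_engine(dst, src, len);
    return dst;
}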
- mr r6,r4 ; Set the "from" - mr. r5,r5 ; Length zero? - crclr noncache ; Set cached - mr r4,r3 ; Set the "to" - beqlr- cr1 ; "to" and "from" are the same - beqlr- ; Length is 0 -#endif -Lcopyit: sub r12,r4,r6 ; Get potential overlap (negative if backward move) - lis r8,0x7FFF ; Start up a mask - srawi r11,r12,31 ; Propagate the sign bit - dcbt 0,r6 ; Touch in the first source line - cntlzw r7,r5 ; Get the highest power of 2 factor of the length - ori r8,r8,0xFFFF ; Make limit 0x7FFFFFFF - xor r9,r12,r11 ; If sink - source was negative, invert bits - srw r8,r8,r7 ; Get move length limitation - sub r9,r9,r11 ; If sink - source was negative, add 1 and get absolute value - cmplw r12,r5 ; See if we actually forward overlap - cmplwi cr7,r9,32 ; See if at least a line between source and sink - dcbtst 0,r4 ; Touch in the first sink line - cmplwi cr1,r5,32 ; Are we moving more than a line? - cror noncache,noncache,28 ; Set to not DCBZ output line if not enough space - blt- Lfwdovrlap ; This is a forward overlapping area, handle it... - -; -; R4 = sink -; R5 = length -; R6 = source -; - -; -; Here we figure out how much we have to move to get the sink onto a -; cache boundary. If we can, and there are still more that 32 bytes -; left to move, we can really speed things up by DCBZing the sink line. -; We can not do this if noncache is set because we will take an -; alignment exception. - - neg r0,r4 ; Get the number of bytes to move to align to a line boundary - rlwinm. r0,r0,0,27,31 ; Clean it up and test it - and r0,r0,r8 ; limit to the maximum front end move - mtcrf 3,r0 ; Make branch mask for partial moves - sub r5,r5,r0 ; Set the length left to move - beq Lalline ; Already on a line... - - bf 31,Lalhalf ; No single byte to do... - lbz r7,0(r6) ; Get the byte - addi r6,r6,1 ; Point to the next - stb r7,0(r4) ; Save the single - addi r4,r4,1 ; Bump sink - -; Sink is halfword aligned here - -Lalhalf: bf 30,Lalword ; No halfword to do... - lhz r7,0(r6) ; Get the halfword - addi r6,r6,2 ; Point to the next - sth r7,0(r4) ; Save the halfword - addi r4,r4,2 ; Bump sink - -; Sink is word aligned here - -Lalword: bf 29,Laldouble ; No word to do... - lwz r7,0(r6) ; Get the word - addi r6,r6,4 ; Point to the next - stw r7,0(r4) ; Save the word - addi r4,r4,4 ; Bump sink - -; Sink is double aligned here - -Laldouble: bf 28,Lalquad ; No double to do... - lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - addi r6,r6,8 ; Point to the next - stw r7,0(r4) ; Save the first word - stw r8,4(r4) ; Save the second word - addi r4,r4,8 ; Bump sink - -; Sink is quadword aligned here - -Lalquad: bf 27,Lalline ; No quad to do... - lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - lwz r9,8(r6) ; Get the third word - stw r7,0(r4) ; Save the first word - lwz r11,12(r6) ; Get the fourth word - addi r6,r6,16 ; Point to the next - stw r8,4(r4) ; Save the second word - stw r9,8(r4) ; Save the third word - stw r11,12(r4) ; Save the fourth word - addi r4,r4,16 ; Bump sink - -; Sink is line aligned here - -Lalline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move - mtcrf 3,r5 ; Make branch mask for backend partial moves - rlwinm r11,r5,0,0,26 ; Get number of bytes to move - beq- Lbackend ; No full lines to move - - sub r5,r5,r11 ; Calculate the residual - li r10,96 ; Stride for touch ahead - -Lnxtline: subic. r0,r0,1 ; Account for the line now - - bt- noncache,Lskipz ; Skip if we are not cached... 
- dcbz 0,r4 ; Blow away the whole line because we are replacing it - dcbt r6,r10 ; Touch ahead a bit - -Lskipz: lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - lwz r9,8(r6) ; Get the third word - stw r7,0(r4) ; Save the first word - lwz r11,12(r6) ; Get the fourth word - stw r8,4(r4) ; Save the second word - lwz r7,16(r6) ; Get the fifth word - stw r9,8(r4) ; Save the third word - lwz r8,20(r6) ; Get the sixth word - stw r11,12(r4) ; Save the fourth word - lwz r9,24(r6) ; Get the seventh word - stw r7,16(r4) ; Save the fifth word - lwz r11,28(r6) ; Get the eighth word - addi r6,r6,32 ; Point to the next - stw r8,20(r4) ; Save the sixth word - stw r9,24(r4) ; Save the seventh word - stw r11,28(r4) ; Save the eighth word - addi r4,r4,32 ; Bump sink - bgt+ Lnxtline ; Do the next line, if any... - - -; Move backend quadword - -Lbackend: bf 27,Lnoquad ; No quad to do... - lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - lwz r9,8(r6) ; Get the third word - lwz r11,12(r6) ; Get the fourth word - stw r7,0(r4) ; Save the first word - addi r6,r6,16 ; Point to the next - stw r8,4(r4) ; Save the second word - stw r9,8(r4) ; Save the third word - stw r11,12(r4) ; Save the fourth word - addi r4,r4,16 ; Bump sink - -; Move backend double - -Lnoquad: bf 28,Lnodouble ; No double to do... - lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - addi r6,r6,8 ; Point to the next - stw r7,0(r4) ; Save the first word - stw r8,4(r4) ; Save the second word - addi r4,r4,8 ; Bump sink - -; Move backend word - -Lnodouble: bf 29,Lnoword ; No word to do... - lwz r7,0(r6) ; Get the word - addi r6,r6,4 ; Point to the next - stw r7,0(r4) ; Save the word - addi r4,r4,4 ; Bump sink - -; Move backend halfword - -Lnoword: bf 30,Lnohalf ; No halfword to do... - lhz r7,0(r6) ; Get the halfword - addi r6,r6,2 ; Point to the next - sth r7,0(r4) ; Save the halfword - addi r4,r4,2 ; Bump sink - -; Move backend byte - -Lnohalf: bflr 31 ; Leave cuz we are all done... - lbz r7,0(r6) ; Get the byte - stb r7,0(r4) ; Save the single - - blr ; Leave cuz we are all done... - -; -; 0123456789ABCDEF0123456789ABCDEF -; 0123456789ABCDEF0123456789ABCDEF -; F -; DE -; 9ABC -; 12345678 -; 123456789ABCDEF0 -; 0 - -; -; Here is where we handle a forward overlapping move. These will be slow -; because we can not kill the cache of the destination until after we have -; loaded/saved the source area. Also, because reading memory backwards is -; slower when the cache line needs to be loaded because the critical -; doubleword is loaded first, i.e., the last, then it goes back to the first, -; and on in order. That means that when we are at the second to last DW we -; have to wait until the whole line is in cache before we can proceed. -; - -Lfwdovrlap: add r4,r5,r4 ; Point past the last sink byte - add r6,r5,r6 ; Point past the last source byte - and r0,r4,r8 ; Apply movement limit - li r12,-1 ; Make sure we touch in the actual line - mtcrf 3,r0 ; Figure out the best way to move backwards - dcbt r12,r6 ; Touch in the last line of source - rlwinm. r0,r0,0,27,31 ; Calculate the length to adjust to cache boundary - dcbtst r12,r4 ; Touch in the last line of the sink - beq- Lballine ; Aready on cache line boundary - - sub r5,r5,r0 ; Precaculate move length left after alignment - - bf 31,Lbalhalf ; No single byte to do... 
- lbz r7,-1(r6) ; Get the byte - subi r6,r6,1 ; Point to the next - stb r7,-1(r4) ; Save the single - subi r4,r4,1 ; Bump sink - -; Sink is halfword aligned here - -Lbalhalf: bf 30,Lbalword ; No halfword to do... - lhz r7,-2(r6) ; Get the halfword - subi r6,r6,2 ; Point to the next - sth r7,-2(r4) ; Save the halfword - subi r4,r4,2 ; Bump sink - -; Sink is word aligned here - -Lbalword: bf 29,Lbaldouble ; No word to do... - lwz r7,-4(r6) ; Get the word - subi r6,r6,4 ; Point to the next - stw r7,-4(r4) ; Save the word - subi r4,r4,4 ; Bump sink - -; Sink is double aligned here - -Lbaldouble: bf 28,Lbalquad ; No double to do... - lwz r7,-8(r6) ; Get the first word - lwz r8,-4(r6) ; Get the second word - subi r6,r6,8 ; Point to the next - stw r7,-8(r4) ; Save the first word - stw r8,-4(r4) ; Save the second word - subi r4,r4,8 ; Bump sink - -; Sink is quadword aligned here - -Lbalquad: bf 27,Lballine ; No quad to do... - lwz r7,-16(r6) ; Get the first word - lwz r8,-12(r6) ; Get the second word - lwz r9,-8(r6) ; Get the third word - lwz r11,-4(r6) ; Get the fourth word - stw r7,-16(r4) ; Save the first word - subi r6,r6,16 ; Point to the next - stw r8,-12(r4) ; Save the second word - stw r9,-8(r4) ; Save the third word - stw r11,-4(r4) ; Save the fourth word - subi r4,r4,16 ; Bump sink - -; Sink is line aligned here - -Lballine: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move - mtcrf 3,r5 ; Make branch mask for backend partial moves - beq- Lbbackend ; No full lines to move - - -; Registers in use: R0, R1, R3, R4, R5, R6 -; Registers not in use: R2, R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them - -Lbnxtline: subic. r0,r0,1 ; Account for the line now - - lwz r7,-32(r6) ; Get the first word - lwz r5,-28(r6) ; Get the second word - lwz r2,-24(r6) ; Get the third word - lwz r12,-20(r6) ; Get the third word - lwz r11,-16(r6) ; Get the fifth word - lwz r10,-12(r6) ; Get the sixth word - lwz r9,-8(r6) ; Get the seventh word - lwz r8,-4(r6) ; Get the eighth word - subi r6,r6,32 ; Point to the next - - stw r7,-32(r4) ; Get the first word - ble- Lbnotouch ; Last time, skip touch of source... - dcbt 0,r6 ; Touch in next source line - -Lbnotouch: stw r5,-28(r4) ; Get the second word - stw r2,-24(r4) ; Get the third word - stw r12,-20(r4) ; Get the third word - stw r11,-16(r4) ; Get the fifth word - stw r10,-12(r4) ; Get the sixth word - stw r9,-8(r4) ; Get the seventh word - stw r8,-4(r4) ; Get the eighth word - subi r4,r4,32 ; Bump sink - - bgt+ Lbnxtline ; Do the next line, if any... - -; -; Note: We touched these lines in at the beginning -; - -; Move backend quadword - -Lbbackend: bf 27,Lbnoquad ; No quad to do... - lwz r7,-16(r6) ; Get the first word - lwz r8,-12(r6) ; Get the second word - lwz r9,-8(r6) ; Get the third word - lwz r11,-4(r6) ; Get the fourth word - stw r7,-16(r4) ; Save the first word - subi r6,r6,16 ; Point to the next - stw r8,-12(r4) ; Save the second word - stw r9,-8(r4) ; Save the third word - stw r11,-4(r4) ; Save the fourth word - subi r4,r4,16 ; Bump sink - -; Move backend double - -Lbnoquad: bf 28,Lbnodouble ; No double to do... - lwz r7,-8(r6) ; Get the first word - lwz r8,-4(r6) ; Get the second word - subi r6,r6,8 ; Point to the next - stw r7,-8(r4) ; Save the first word - stw r8,-4(r4) ; Save the second word - subi r4,r4,8 ; Bump sink - -; Move backend word - -Lbnodouble: bf 29,Lbnoword ; No word to do... 
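Aside: both directions of this move follow the same head / body / tail shape: a few small moves until the destination sits on a 32-byte cache-line boundary, a loop over whole lines (where dcbz can clear the destination line and dcbt can touch ahead), then progressively smaller backend moves for the leftovers. A rough C sketch of that shape for the ascending direction only, assuming 32-byte lines and omitting the cache hints, which have no portable C equivalent:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define LINE 32   /* cache-line size assumed by the assembly */

static void copy_by_lines(unsigned char *dst, const unsigned char *src, size_t len)
{
    /* Head: single bytes until dst reaches a 32-byte boundary. */
    size_t head = (size_t)(-(uintptr_t)dst & (LINE - 1));
    if (head > len)
        head = len;
    len -= head;
    while (head--)
        *dst++ = *src++;

    /* Body: whole 32-byte lines (the asm would dcbz/dcbt here). */
    while (len >= LINE) {
        memcpy(dst, src, LINE);    /* stands in for the eight lwz/stw pairs */
        dst += LINE;
        src += LINE;
        len -= LINE;
    }

    /* Tail: whatever is left, smaller than one line. */
    while (len--)
        *dst++ = *src++;
}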
- lwz r7,-4(r6) ; Get the word - subi r6,r6,4 ; Point to the next - stw r7,-4(r4) ; Save the word - subi r4,r4,4 ; Bump sink - -; Move backend halfword - -Lbnoword: bf 30,Lbnohalf ; No halfword to do... - lhz r7,-2(r6) ; Get the halfword - subi r6,r6,2 ; Point to the next - sth r7,-2(r4) ; Save the halfword - subi r4,r4,2 ; Bump sink - -; Move backend byte - -Lbnohalf: bflr 31 ; Leave cuz we are all done... - lbz r7,-1(r6) ; Get the byte - stb r7,-1(r4) ; Save the single - - blr ; Leave cuz we are all done... diff --git a/gen.subproj/ppc.subproj/blockmoof.s b/gen.subproj/ppc.subproj/blockmoof.s new file mode 100755 index 0000000..947e7f0 --- /dev/null +++ b/gen.subproj/ppc.subproj/blockmoof.s @@ -0,0 +1,940 @@ +/* + * Copyright (c) 1992-2001 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include + +// ================================================================================================= +// *** The easiest way to assemble things on Mac OS X is via "cc", so this uses #defines and such. +// ================================================================================================= + +// Keep track of whether we have Altivec +// This gets set in pthread_init() + +.data +.align 2 +.globl __cpu_has_altivec +__cpu_has_altivec: +.long 0 + +.text +.align 2 +.globl _bcopy +.globl _memcpy +.globl _memmove + +_bcopy: + mr r2,r4 // Since bcopy uses (src,dest,count), swap r3,r4 + mr r4,r3 + mr r3,r2 +_memcpy: +_memmove: + mr r2,r3 // Store dest ptr in r2 to preserve r3 on return + +// ------------------ +// Standard registers + +#define rs r4 +#define rd r2 +#define rc r5 + +// Should we bother using Altivec? + + cmpwi r5, 128 + blt+ LScalar + +// Determine whether we have Altivec enabled + + mflr r0 + bcl 20,31,1f +1: + mflr r6 + mtlr r0 + addis r6, r6, ha16(__cpu_has_altivec - 1b) + lwz r6, lo16(__cpu_has_altivec - 1b)(r6) + cmpwi r6, 0 + bne+ LAltivec + +// ================================================================================================= + +// ***************************************** +// * S c a l a r B l o c k M o o f D a t a * +// ***************************************** +// +// This is the scalar (non-AltiVec) version of BlockMoofData. +// +// void ScalarBlockMoofData (ptr sou, ptr dest, long len) +// void ScalarBlockMoofDataUncached (ptr sou, ptr dest, long len) +// +// +// Calling Sequence: r3 = source pointer +// r4 = destination pointer +// r5 = length in bytes +// +// Uses: all volatile registers. + +LScalar: + cmplwi cr7,rc,32 // length <= 32 bytes? + cmplw cr6,rd,rs // up or down? + mr. 
r0,rc // copy to r0 for MoveShort, and test for negative + bgt cr7,Lbm1 // skip if count > 32 + +// Handle short moves (<=32 bytes.) + + beq cr7,LMove32 // special case 32-byte blocks + blt cr6,LMoveDownShort // move down in memory and return + add rs,rs,rc // moving up (right-to-left), so adjust pointers + add rd,rd,rc + b LMoveUpShort // move up in memory and return + +// Handle long moves (>32 bytes.) + +Lbm1: + beqlr cr6 // rs==rd, so nothing to move + bltlr cr0 // length<0, so ignore call and return + mflr r12 // save return address + bge cr6,Lbm2 // rd>=rs, so move up + +// Long moves down (left-to-right.) + + neg r6,rd // start to 32-byte-align destination + andi. r0,r6,0x1F // r0 <- bytes to move to align destination + bnel LMoveDownShort // align destination if necessary + bl LMoveDownLong // move 32-byte chunks down + andi. r0,rc,0x1F // done? + mtlr r12 // restore caller's return address + bne LMoveDownShort // move trailing leftover bytes and done + blr // no leftovers, so done + +// Long moves up (right-to-left.) + +Lbm2: + add rs,rs,rc // moving up (right-to-left), so adjust pointers + add rd,rd,rc + andi. r0,rd,0x1F // r0 <- bytes to move to align destination + bnel LMoveUpShort // align destination if necessary + bl LMoveUpLong // move 32-byte chunks up + andi. r0,rc,0x1F // done? + mtlr r12 // restore caller's return address + bne LMoveUpShort // move trailing leftover bytes and done + blr // no leftovers, so done + +// *************** +// * M O V E 3 2 * +// *************** +// +// Special case subroutine to move a 32-byte block. MoveDownShort and +// MoveUpShort only handle 0..31 bytes, and we believe 32 bytes is too +// common a case to send it through the general purpose long-block code. +// Since it moves both up and down, we must load all 32 bytes before +// storing any. +// +// Calling Sequence: rs = source ptr +// rd = destination ptr +// +// Uses: r0,r5-r11. +// + +LMove32: + lwz r0,0(rs) + lwz r5,4(rs) + lwz r6,8(rs) + lwz r7,12(rs) + lwz r8,16(rs) + lwz r9,20(rs) + lwz r10,24(rs) + lwz r11,28(rs) + stw r0,0(rd) + stw r5,4(rd) + stw r6,8(rd) + stw r7,12(rd) + stw r8,16(rd) + stw r9,20(rd) + stw r10,24(rd) + stw r11,28(rd) + blr + + +// ************************* +// * M o v e U p S h o r t * +// ************************* +// +// Subroutine called to move <32 bytes up in memory (ie, right-to-left). +// +// Entry conditions: rs = last byte moved from source (right-to-left) +// rd = last byte moved into destination +// r0 = #bytes to move (0..31) +// +// Exit conditions: rs = updated source ptr +// rd = updated destination ptr +// rc = decremented by #bytes moved +// +// Uses: r0,r6,r7,r8,cr7. +// + +LMoveUpShort: + andi. 
r6,r0,0x10 // test 0x10 bit in length + mtcrf 0x1,r0 // move count to cr7 so we can test bits + sub rc,rc,r0 // decrement count of bytes remaining to be moved + beq Lmus1 // skip if 0x10 bit in length is 0 + lwzu r0,-16(rs) // set, so copy up 16 bytes + lwz r6,4(rs) + lwz r7,8(rs) + lwz r8,12(rs) + stwu r0,-16(rd) + stw r6,4(rd) + stw r7,8(rd) + stw r8,12(rd) + +Lmus1: + bf 28,Lmus2 // test 0x08 bit + lwzu r0,-8(rs) + lwz r6,4(rs) + stwu r0,-8(rd) + stw r6,4(rd) + +Lmus2: + bf 29,Lmus3 // test 0x4 bit + lwzu r0,-4(rs) + stwu r0,-4(rd) + +Lmus3: + bf 30,Lmus4 // test 0x2 bit + lhzu r0,-2(rs) + sthu r0,-2(rd) + +Lmus4: + bflr 31 // test 0x1 bit, return if 0 + lbzu r0,-1(rs) + stbu r0,-1(rd) + blr + + +// ***************************** +// * M o v e D o w n S h o r t * +// ***************************** +// +// Subroutine called to move <32 bytes down in memory (ie, left-to-right). +// +// Entry conditions: rs = source pointer +// rd = destination pointer +// r0 = #bytes to move (0..31) +// +// Exit conditions: rs = ptr to 1st byte not moved +// rd = ptr to 1st byte not moved +// rc = decremented by #bytes moved +// +// Uses: r0,r6,r7,r8,cr7. +// + +LMoveDownShort: + andi. r6,r0,0x10 // test 0x10 bit in length + mtcrf 0x1,r0 // move count to cr7 so we can test bits + sub rc,rc,r0 // decrement count of bytes remaining to be moved + beq Lmds1 // skip if 0x10 bit in length is 0 + lwz r0,0(rs) // set, so copy up 16 bytes + lwz r6,4(rs) + lwz r7,8(rs) + lwz r8,12(rs) + addi rs,rs,16 + stw r0,0(rd) + stw r6,4(rd) + stw r7,8(rd) + stw r8,12(rd) + addi rd,rd,16 + +Lmds1: + bf 28,Lmds2 // test 0x08 bit + lwz r0,0(rs) + lwz r6,4(rs) + addi rs,rs,8 + stw r0,0(rd) + stw r6,4(rd) + addi rd,rd,8 + +Lmds2: + bf 29,Lmds3 // test 0x4 bit + lwz r0,0(rs) + addi rs,rs,4 + stw r0,0(rd) + addi rd,rd,4 + +Lmds3: + bf 30,Lmds4 // test 0x2 bit + lhz r0,0(rs) + addi rs,rs,2 + sth r0,0(rd) + addi rd,rd,2 + +Lmds4: + bflr 31 // test 0x1 bit, return if 0 + lbz r0,0(rs) + addi rs,rs,1 + stb r0,0(rd) + addi rd,rd,1 + blr + + +// *********************** +// * M o v e U p L o n g * +// *********************** +// +// Subroutine to move 32-byte chunks of memory up (ie, right-to-left.) +// The destination is known to be 32-byte aligned, but the source is +// *not* necessarily aligned. +// +// Entry conditions: rs = last byte moved from source (right-to-left) +// rd = last byte moved into destination +// rc = count of bytes to move +// cr = crCached set iff destination is cacheable +// +// Exit conditions: rs = updated source ptr +// rd = updated destination ptr +// rc = low order 8 bits of count of bytes to move +// +// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7. +// + +LMoveUpLong: + srwi. r11,rc,5 // r11 <- #32 byte chunks to move + mtctr r11 // prepare loop count + beqlr // return if no chunks to move + andi. r0,rs,7 // is source at least doubleword aligned? + beq Lmup3 // yes, can optimize this case + mtcrf 0x1,rc // save low bits of count + mtcrf 0x2,rc // (one cr at a time, as 604 prefers) + +Lmup1: // loop over each 32-byte-chunk + lwzu r0,-32(rs) + subi rd,rd,32 // prepare destination address for 'dcbz' + lwz r5,4(rs) + lwz r6,8(rs) + lwz r7,12(rs) + lwz r8,16(rs) + lwz r9,20(rs) + lwz r10,24(rs) + lwz r11,28(rs) + stw r0,0(rd) + stw r5,4(rd) + stw r6,8(rd) + stw r7,12(rd) + stw r8,16(rd) + stw r9,20(rd) + stw r10,24(rd) + stw r11,28(rd) + bdnz Lmup1 + mfcr rc // restore low bits of count + blr // return to caller + +// Aligned operands, so use d.p. floating point registers to move data. 
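Aside: when both operands are doubleword aligned, the long scalar loops in this file (Lmup3 just below, and Lmdown3 in the left-to-right routine) move data through 64-bit floating-point registers, so each 32-byte block costs four loads and four stores instead of eight of each. A rough C equivalent using 64-bit integer chunks, assuming both pointers really are 8-byte aligned and the length is a multiple of 32:

#include <stddef.h>
#include <stdint.h>

/* Copy len bytes, len a multiple of 32, both pointers 8-byte aligned. */
static void copy_doublewords(void *dst, const void *src, size_t len)
{
    uint64_t *d = (uint64_t *)dst;
    const uint64_t *s = (const uint64_t *)src;

    while (len >= 32) {            /* four 8-byte moves per 32-byte block */
        d[0] = s[0];
        d[1] = s[1];
        d[2] = s[2];
        d[3] = s[3];
        d += 4;
        s += 4;
        len -= 32;
    }
}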
+ +Lmup3: + lfdu f0,-32(rs) + subi rd,rd,32 // prepare destination address for 'dcbz' + lfd f1,8(rs) + lfd f2,16(rs) + lfd f3,24(rs) + stfd f0,0(rd) + stfd f1,8(rd) + stfd f2,16(rd) + stfd f3,24(rd) + bdnz Lmup3 + blr // return to caller + + +// *************************** +// * M o v e D o w n L o n g * +// *************************** +// +// Subroutine to move 32-byte chunks of memory down (ie, left-to-right.) +// The destination is known to be 32-byte aligned, but the source is +// *not* necessarily aligned. +// +// Entry conditions: rs = source ptr (next byte to move) +// rd = dest ptr (next byte to move into) +// rc = count of bytes to move +// cr = crCached set iff destination is cacheable +// +// Exit conditions: rs = updated source ptr +// rd = updated destination ptr +// rc = low order 8 bits of count of bytes to move +// +// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7. +// + +LMoveDownLong: + srwi. r11,rc,5 // r11 <- #32 byte chunks to move + mtctr r11 // prepare loop count + beqlr // return if no chunks to move + andi. r0,rs,7 // is source at least doubleword aligned? + beq Lmdown3 // yes, can optimize this case + mtcrf 0x1,rc // save low 8 bits of count + mtcrf 0x2,rc // (one cr at a time, as 604 prefers) + +Lmdown1: // loop over each 32-byte-chunk + lwz r0,0(rs) + lwz r5,4(rs) + lwz r6,8(rs) + lwz r7,12(rs) + lwz r8,16(rs) + lwz r9,20(rs) + lwz r10,24(rs) + lwz r11,28(rs) + stw r0,0(rd) + stw r5,4(rd) + stw r6,8(rd) + stw r7,12(rd) + stw r8,16(rd) + stw r9,20(rd) + addi rs,rs,32 + stw r10,24(rd) + stw r11,28(rd) + addi rd,rd,32 + bdnz Lmdown1 + mfcr rc // restore low bits of count + blr // return to caller + +// Aligned operands, so use d.p. floating point registers to move data. + +Lmdown3: + lfd f0,0(rs) + lfd f1,8(rs) + lfd f2,16(rs) + lfd f3,24(rs) + addi rs,rs,32 + stfd f0,0(rd) + stfd f1,8(rd) + stfd f2,16(rd) + stfd f3,24(rd) + addi rd,rd,32 + bdnz Lmdown3 + blr // return to caller + +// +// Register use conventions are as follows: +// +// r0 - temp +// r6 - copy of VMX SPR at entry +// r7 - temp +// r8 - constant -1 (also temp and a string op buffer) +// r9 - constant 16 or -17 (also temp and a string op buffer) +// r10- constant 32 or -33 (also temp and a string op buffer) +// r11- constant 48 or -49 (also temp and a string op buffer) +// r12- chunk count ("c") in long moves +// +// v0 - vp - permute vector +// v1 - va - 1st quadword of source +// v2 - vb - 2nd quadword of source +// v3 - vc - 3rd quadword of source +// v4 - vd - 4th quadword of source +// v5 - vx - temp +// v6 - vy - temp +// v7 - vz - temp + +#define vp v0 +#define va v1 +#define vb v2 +#define vc v3 +#define vd v4 +#define vx v5 +#define vy v6 +#define vz v7 + +#define VRSave 256 + +// kShort should be the crossover point where the long algorithm is faster than the short. +// WARNING: kShort must be >= 64 + +// Yes, I know, we just checked rc > 128 to get here... + +#define kShort 128 +LAltivec: + cmpwi cr1,rc,kShort //(1) too short to bother using vector regs? + sub. 
r0,rd,rs //(1) must move reverse if (rd-rs)<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// <><> S H O R T O P E R A N D S <><> +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> + +LAlignedLoop: // word aligned operands (the common case) + lfd f0,0(rs) //(1) + lfd f1,8(rs) //(2) + addi rs,rs,16 //(2) + stfd f0,0(rd) //(3) + stfd f1,8(rd) //(4) + addi rd,rd,16 //(4) + bdnz LAlignedLoop //(4) + +Leftovers: + beqlr- cr7 //(8) done if r7==0, ie no leftover bytes + mtxer r7 //(9) count of bytes to move (1-15) + lswx r8,0,rs + stswx r8,0,rd + blr //(17) + +LUnalignedLoop: // not word aligned, cannot use lfd/stfd + lwz r8,0(rs) //(1) + lwz r9,4(rs) //(2) + lwz r10,8(rs) //(3) + lwz r11,12(rs) //(4) + addi rs,rs,16 //(4) + stw r8,0(rd) //(5) + stw r9,4(rd) //(6) + stw r10,8(rd) //(7) + stw r11,12(rd) //(8) + addi rd,rd,16 //(8) + bdnz LUnalignedLoop //(8) + + b Leftovers + + +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// <><> S H O R T R E V E R S E M O V E S <><> +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> + + // cr0 & r9 <- #doublewords to move (>=0) + // cr1 <- beq if word aligned + // r7 <- #leftover bytes to move (0-15) + +LShortReverse: + cmpwi cr7,r7,0 // leftover bytes? + add rs,rs,rc // point 1 past end of string for reverse moves + add rd,rd,rc + beq- LeftoversReverse // r9==0, ie no words to move + mtctr r9 // set up for quadword loop + bne- cr1,LUnalignedLoopReverse + +LAlignedLoopReverse: // word aligned, so use lfd/stfd + lfd f0,-8(rs) + lfdu f1,-16(rs) + stfd f0,-8(rd) + stfdu f1,-16(rd) + bdnz LAlignedLoopReverse + +LeftoversReverse: + beqlr- cr7 // done if r7==0, ie no leftover bytes + mtxer r7 // count of bytes to move (1-15) + neg r7,r7 // index back by #bytes + lswx r8,r7,rs + stswx r8,r7,rd + blr + +LUnalignedLoopReverse: // not word aligned, cannot use lfd/stfd + lwz r8,-4(rs) + lwz r9,-8(rs) + lwz r10,-12(rs) + lwzu r11,-16(rs) + stw r8,-4(rd) + stw r9,-8(rd) + stw r10,-12(rd) + stwu r11,-16(rd) + bdnz LUnalignedLoopReverse + + b LeftoversReverse + +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// <><> L O N G O P E R A N D S <><> +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> + + // cr6 set (blt) if must move reverse + // r0 <- (rd - rs) + +LMoveLong: + mfspr r6,VRSave //(5) save caller's VMX mask register + stw r6,-4(r1) // use CR save area so we can use r6 later + neg r8,rd //(5) start to compute #bytes to fill in 1st dest quadword + rlwinm r0,r0,0,28,31 //(6) start to determine relative alignment + andi. r7,r8,0xF //(6) r7 <- #bytes to fill in 1st dest quadword + cmpwi cr7,r0,0 //(7) relatively aligned? (ie, 16 bytes apart?) + oris r9,r6,0xFF00 //(7) light bits for regs we use (v0-v7) + mtspr VRSave,r9 //(8) update live register bitmask + blt- cr6,LongReverse //(8) must move reverse direction + sub rc,rc,r7 //(9) adjust length while we wait + beq- LDest16Aligned //(9) r7==0, ie destination already quadword aligned + + // Align destination on a quadword. + + mtxer r7 //(10) set up byte count (1-15) + lswx r8,0,rs // load into r8-r11 + stswx r8,0,rd // store r8-r11 (measured latency on arthur is 7.2 cycles) + add rd,rd,r7 //(18) adjust ptrs + add rs,rs,r7 //(18) + + // Begin preparation for inner loop and "dst" stream. + +LDest16Aligned: + andi. r0,rd,0x10 //(19) is destination cache-block aligned? 
+ li r9,16 //(19) r9 <- constant used to access 2nd quadword + li r10,32 //(20) r10<- constant used to access 3rd quadword + beq- cr7,LAligned //(20) handle relatively aligned operands + lvx va,0,rs //(20) prefetch 1st source quadword + li r11,48 //(21) r11<- constant used to access 4th quadword + lvsl vp,0,rs //(21) get permute vector to left shift + beq LDest32Aligned //(22) destination already cache-block aligned + + // Copy 16 bytes to align destination on 32-byte (cache block) boundary + // to maximize store gathering. + + lvx vb,r9,rs //(23) get 2nd source qw + subi rc,rc,16 //(23) adjust count + addi rs,rs,16 //(24) adjust source ptr + vperm vx,va,vb,vp //(25) vx <- 1st destination qw + vor va,vb,vb //(25) va <- vb + stvx vx,0,rd //(26) assuming store Q deep enough to avoid latency + addi rd,rd,16 //(26) adjust dest ptr + + // Destination 32-byte aligned, source alignment unknown. + +LDest32Aligned: + srwi. r12,rc,6 //(27) r12<- count of 64-byte chunks to move + rlwinm r7,rc,28,30,31 //(27) r7 <- count of 16-byte chunks to move + cmpwi cr1,r7,0 //(28) remember if any 16-byte chunks + rlwinm r8,r12,0,26,31 //(29) mask chunk count down to 0-63 + subi r0,r8,1 //(30) r8==0? + beq- LNoChunks //(30) r12==0, ie no chunks to move + rlwimi r8,r0,0,25,25 //(31) if r8==0, then r8 <- 64 + li r0,64 //(31) r0 <- used to get 1st quadword of next chunk + sub. r12,r12,r8 //(32) adjust chunk count, set cr0 + mtctr r8 //(32) set up loop count + li r8,96 //SKP + li r6,128 //SKP + // Inner loop for unaligned sources. We copy 64 bytes per iteration. + // We loop at most 64 times, then reprime the "dst" and loop again for + // the next 4KB. This loop is tuned to keep the CPU flat out, which + // means we need to execute a lvx or stvx every cycle. + +LoopBy64: + dcbt rs,r8 //SKP + dcbt rs,r6 //SKP + lvx vb,r9,rs //(1) 2nd source quadword (1st already in va) + lvx vc,r10,rs //(2) 3rd + lvx vd,r11,rs //(3) 4th + vperm vx,va,vb,vp //(3) vx <- 1st destination quadword + lvx va,rs,r0 //(4) get 1st qw of next 64-byte chunk (r0 must be RB!) + vperm vy,vb,vc,vp //(4) vy <- 2nd dest qw + stvx vx,0,rd //(5) + vperm vz,vc,vd,vp //(5) vz <- 3rd dest qw + stvx vy,r9,rd //(6) + vperm vx,vd,va,vp //(6) vx <- 4th + stvx vz,r10,rd //(7) + addi rs,rs,64 //(7) + stvx vx,r11,rd //(8) + addi rd,rd,64 //(8) + bdnz LoopBy64 //(8) + + // End of inner loop. Should we reprime dst stream and restart loop? + // This block is only executed when we're moving more than 4KB. + // It is usually folded out because cr0 is set in the loop prologue. + + beq+ LNoChunks // r12==0, ie no more chunks to move + sub. r12,r12,r0 // set cr0 if more than 4KB remain to xfer + mtctr r0 // initialize loop count to 64 + b LoopBy64 // restart inner loop, xfer another 4KB + + // Fewer than 64 bytes remain to be moved. + +LNoChunks: // r7 and cr1 are set with the number of QWs + andi. rc,rc,0xF //(33) rc <- leftover bytes + beq- cr1,LCleanup //(33) r7==0, ie fewer than 16 bytes remaining + mtctr r7 //(34) we will loop over 1-3 QWs + +LoopBy16: + lvx vb,r9,rs //(1) vb <- 2nd source quadword + addi rs,rs,16 //(1) + vperm vx,va,vb,vp //(3) vx <- next destination quadword + vor va,vb,vb //(3) va <- vb + stvx vx,0,rd //(4) assuming store Q is deep enough to mask latency + addi rd,rd,16 //(4) + bdnz LoopBy16 //(4) + + // Move remaining bytes in last quadword. rc and cr0 have the count. 
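Aside: the unaligned inner loops above (LoopBy64 and LoopBy16) never issue a misaligned load. lvx always fetches aligned quadwords, and vperm, driven by the permute vector that lvsl built from the source address, splices two neighbouring quadwords into each output value, with "vor va,vb,vb" carrying the trailing quadword into the next iteration. The same splicing idea in plain C, scaled down to 32-bit words and assuming a big-endian layout like the PowerPC targets this file is written for; like the assembly, it may read a few bytes past the end of the source, but only within the last aligned word:

#include <stddef.h>
#include <stdint.h>

/*
 * Copy nwords 32-bit words from a possibly misaligned source to an aligned
 * destination, issuing only aligned 32-bit loads (big-endian byte order).
 */
static void copy_spliced_be(uint32_t *dst, const unsigned char *src, size_t nwords)
{
    size_t shift = (uintptr_t)src & 3;                      /* source misalignment  */
    const uint32_t *asrc = (const uint32_t *)(src - shift); /* aligned base pointer */

    if (shift == 0) {                                       /* nothing to splice    */
        while (nwords--)
            *dst++ = *asrc++;
        return;
    }

    uint32_t prev = *asrc++;                                /* priming load (lvx)   */
    while (nwords--) {
        uint32_t next = *asrc++;                            /* next aligned word    */
        /* Splice low bytes of prev with high bytes of next (the vperm step). */
        *dst++ = (prev << (8 * shift)) | (next >> (8 * (4 - shift)));
        prev = next;                                        /* carry (vor va,vb,vb) */
    }
}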
+ +LCleanup: + lwz r6,-4(r1) // load VRSave from CR save area + mtspr VRSave,r6 //(35) restore caller's live-register bitmask + beqlr //(36) rc==0, ie no leftovers, so done + mtxer rc //(37) load byte count (1-15) + lswx r8,0,rs + stswx r8,0,rd + blr //(45) + + +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// <><> L O N G A L I G N E D M O V E S <><> +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> + + // rs, rd <- both quadword aligned + // cr0 <- beq if dest is cache block (32-byte) aligned + // r9 <- 16 + // r10 <- 32 + +LAligned: + lvx va,0,rs // prefetch 1st source quadword + li r11,48 // r11<- constant used to access 4th quadword + beq LAligned32 // destination already cache-block aligned + + // Copy 16 bytes to align destination on 32-byte (cache block) boundary + // to maximize store gathering. + + subi rc,rc,16 // adjust count + addi rs,rs,16 // adjust source ptr + stvx va,0,rd // assuming store Q deep enough to avoid latency + addi rd,rd,16 // adjust dest ptr + + // Destination 32-byte aligned, source 16-byte aligned. Set up for inner loop. + +LAligned32: + srwi. r12,rc,6 // r12<- count of 64-byte chunks to move + rlwinm r7,rc,28,30,31 // r7 <- count of 16-byte chunks to move + cmpwi cr1,r7,0 // remember if any 16-byte chunks + rlwinm r8,r12,0,26,31 // mask chunk count down to 0-63 + subi r0,r8,1 // r8==0? + beq- LAlignedNoChunks // r12==0, ie no chunks to move + rlwimi r8,r0,0,25,25 // if r8==0, then r8 <- 64 + li r0,64 // r0 <- used at end of loop + sub. r12,r12,r8 // adjust chunk count, set cr0 + mtctr r8 // set up loop count + li r8,96 //SKP + li r6,128 //SKP + + // Inner loop for aligned sources. We copy 64 bytes per iteration. + +LAlignedLoopBy64: + dcbt rs,r8 //SKP + dcbt rs,r6 //SKP + lvx va,0,rs //(1) + lvx vb,r9,rs //(2) + lvx vc,r10,rs //(3) + lvx vd,r11,rs //(4) + addi rs,rs,64 //(4) + stvx va,0,rd //(5) + stvx vb,r9,rd //(6) + stvx vc,r10,rd //(7) + stvx vd,r11,rd //(8) + addi rd,rd,64 //(8) + bdnz LAlignedLoopBy64 //(8) + + // End of inner loop. Loop again for next 4KB iff any. + + beq+ LAlignedNoChunks // r12==0, ie no more chunks to move + sub. r12,r12,r0 // set cr0 if more than 4KB remain to xfer + mtctr r0 // reinitialize loop count to 64 + b LAlignedLoopBy64 // restart inner loop, xfer another 4KB + + // Fewer than 64 bytes remain to be moved. + +LAlignedNoChunks: // r7 and cr1 are set with the number of QWs + andi. rc,rc,0xF // rc <- leftover bytes + beq- cr1,LCleanup // r7==0, ie fewer than 16 bytes remaining + mtctr r7 // we will loop over 1-3 QWs + +LAlignedLoopBy16: + lvx va,0,rs // get next quadword + addi rs,rs,16 + stvx va,0,rd + addi rd,rd,16 + bdnz LAlignedLoopBy16 + + b LCleanup // handle last 0-15 bytes, if any + + +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// <><> L O N G R E V E R S E M O V E S <><> +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> + + // Reverse moves. These involve overlapping operands, with the source + // lower in memory (lower addresses) than the destination. They must be + // done right-to-left, ie from high addresses down to low addresses. + // Throughout this code, we maintain rs and rd as pointers one byte past + // the end of the untransferred operands. 
+ // + // The byte count is >=kShort and the following registers are already loaded: + // + // r6 - VMX mask at entry + // cr7 - beq if relatively aligned + // + +LongReverse: + add rd,rd,rc // update source/dest ptrs to be 1 byte past end + add rs,rs,rc + andi. r7,rd,0xF // r7 <- #bytes needed to move to align destination + sub rc,rc,r7 // adjust length while we wait + sub rs,rs,r7 // adjust ptrs by #bytes to xfer, also while we wait + sub rd,rd,r7 + beq- LDest16AlignedReverse + + // Align destination on a quadword. Note that we do NOT align on a cache + // block boundary for store gathering etc// since all these operands overlap + // many dest cache blocks will already be in the L1, so its not clear that + // this would be a win. + + mtxer r7 // load byte count + lswx r8,0,rs + stswx r8,0,rd + + // Prepare for inner loop and start "dstst" stream. Frankly, its not + // clear whether "dst" or "dstst" would be better// somebody should + // measure. We use "dstst" because, being overlapped, at least some + // source cache blocks will also be stored into. + +LDest16AlignedReverse: + srwi. r12,rc,6 // r12 <- count of 64-byte chunks to move + rlwinm r0,rc,11,9,15 // position quadword count for dst + rlwinm r11,r12,0,26,31 // mask chunk count down to 0-63 + li r9,-17 // r9 <- constant used to access 2nd quadword + oris r0,r0,0x0100 // set dst block size to 1 qw + li r10,-33 // r10<- constant used to access 3rd quadword + ori r0,r0,0xFFE0 // set dst stride to -16 bytes + li r8,-1 // r8<- constant used to access 1st quadword + dstst rs,r0,3 // start stream 0 + subi r0,r11,1 // r11==0 ? + lvx va,r8,rs // prefetch 1st source quadword + rlwinm r7,rc,28,30,31 // r7 <- count of 16-byte chunks to move + lvsl vp,0,rs // get permute vector to right shift + cmpwi cr1,r7,0 // remember if any 16-byte chunks + beq- LNoChunksReverse // r12==0, so skip inner loop + rlwimi r11,r0,0,25,25 // if r11==0, then r11 <- 64 + sub. r12,r12,r11 // adjust chunk count, set cr0 + mtctr r11 // set up loop count + li r11,-49 // r11<- constant used to access 4th quadword + li r0,-64 // r0 <- used for several purposes + beq- cr7,LAlignedLoopBy64Reverse + + // Inner loop for unaligned sources. We copy 64 bytes per iteration. + +LoopBy64Reverse: + lvx vb,r9,rs //(1) 2nd source quadword (1st already in va) + lvx vc,r10,rs //(2) 3rd quadword + lvx vd,r11,rs //(3) 4th + vperm vx,vb,va,vp //(3) vx <- 1st destination quadword + lvx va,rs,r0 //(4) get 1st qw of next 64-byte chunk (note r0 must be RB) + vperm vy,vc,vb,vp //(4) vy <- 2nd dest qw + stvx vx,r8,rd //(5) + vperm vz,vd,vc,vp //(5) vz <- 3rd destination quadword + stvx vy,r9,rd //(6) + vperm vx,va,vd,vp //(6) vx <- 4th qw + stvx vz,r10,rd //(7) + subi rs,rs,64 //(7) + stvx vx,r11,rd //(8) + subi rd,rd,64 //(8) + bdnz LoopBy64Reverse //(8) + + // End of inner loop. Should we reprime dst stream and restart loop? + // This block is only executed when we're moving more than 4KB. + // It is usually folded out because cr0 is set in the loop prologue. + + beq+ LNoChunksReverse // r12==0, ie no more chunks to move + lis r8,0x0440 // dst control: 64 4-qw blocks + add. r12,r12,r0 // set cr0 if more than 4KB remain to xfer + ori r8,r8,0xFFC0 // stride is -64 bytes + dstst rs,r8,3 // restart the prefetch stream + li r8,64 // inner loop count + mtctr r8 // initialize loop count to 64 + li r8,-1 // restore qw1 offset for inner loop + b LoopBy64Reverse // restart inner loop, xfer another 4KB + + // Fewer than 64 bytes remain to be moved. 
+ +LNoChunksReverse: // r7 and cr1 are set with the number of QWs + andi. rc,rc,0xF // rc <- leftover bytes + beq- cr1,LCleanupReverse // r7==0, ie fewer than 16 bytes left + mtctr r7 + beq- cr7,LAlignedLoopBy16Reverse + +LoopBy16Reverse: + lvx vb,r9,rs // vb <- 2nd source quadword + subi rs,rs,16 + vperm vx,vb,va,vp // vx <- next destination quadword + vor va,vb,vb // va <- vb + stvx vx,r8,rd + subi rd,rd,16 + bdnz LoopBy16Reverse + + // Fewer that 16 bytes remain to be moved. + +LCleanupReverse: // rc and cr0 set with remaining byte count + lwz r6,-4(r1) // load VRSave from CR save area + mtspr VRSave,r6 // restore caller's live-register bitmask + beqlr // rc==0, ie no leftovers so done + neg r7,rc // get -(#bytes) + mtxer rc // byte count + lswx r8,r7,rs + stswx r8,r7,rd + blr + + +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// <><> A L I G N E D L O N G R E V E R S E M O V E S <><> +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> + + // Inner loop. We copy 64 bytes per iteration. + +LAlignedLoopBy64Reverse: + lvx va,r8,rs //(1) + lvx vb,r9,rs //(2) + lvx vc,r10,rs //(3) + lvx vd,r11,rs //(4) + subi rs,rs,64 //(4) + stvx va,r8,rd //(5) + stvx vb,r9,rd //(6) + stvx vc,r10,rd //(7) + stvx vd,r11,rd //(8) + subi rd,rd,64 //(8) + bdnz LAlignedLoopBy64Reverse //(8) + + // End of inner loop. Loop for next 4KB iff any. + + beq+ LNoChunksReverse // r12==0, ie no more chunks to move + lis r8,0x0440 // dst control: 64 4-qw blocks + add. r12,r12,r0 // r12 <- r12 - 64, set cr0 + ori r8,r8,0xFFC0 // stride is -64 bytes + dstst rs,r8,3 // restart the prefetch stream + li r8,64 // inner loop count + mtctr r8 // initialize loop count to 64 + li r8,-1 // restore qw1 offset for inner loop + b LAlignedLoopBy64Reverse + + // Loop to copy leftover quadwords (1-3). + +LAlignedLoopBy16Reverse: + lvx va,r8,rs // get next qw + subi rs,rs,16 + stvx va,r8,rd + subi rd,rd,16 + bdnz LAlignedLoopBy16Reverse + + b LCleanupReverse // handle up to 15 bytes in last qw diff --git a/gen.subproj/ppc.subproj/memcpy.s b/gen.subproj/ppc.subproj/memcpy.s deleted file mode 100644 index 0c371f6..0000000 --- a/gen.subproj/ppc.subproj/memcpy.s +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -#define MEMCPY -#include "bcopy.s" diff --git a/gen.subproj/ppc.subproj/memmove.s b/gen.subproj/ppc.subproj/memmove.s deleted file mode 100644 index d517786..0000000 --- a/gen.subproj/ppc.subproj/memmove.s +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -#define MEMMOVE -#include "bcopy.s" diff --git a/gen.subproj/scalable_malloc.c b/gen.subproj/scalable_malloc.c index cbf6ab8..a19c7a4 100644 --- a/gen.subproj/scalable_malloc.c +++ b/gen.subproj/scalable_malloc.c @@ -76,7 +76,8 @@ If 0 then the block is either free (in which case the size is directly at the bl #define PROTECT_SMALL 0 // Should be 0: 1 is too slow for normal use -#define LARGE_CACHE_SIZE 4 // define hysterisis of large chunks +#define LARGE_CACHE_SIZE 1 // define hysterisis of large chunks +#define MAX_LARGE_SIZE_TO_CACHE (128*1024) /* blocks larger than this are not cached */ #define MAX_RECORDER_BUFFER 256 @@ -149,6 +150,7 @@ static size_t szone_good_size(szone_t *szone, size_t size); static boolean_t szone_check_all(szone_t *szone, const char *function); static void szone_print(szone_t *szone, boolean_t verbose); static INLINE region_t *region_for_ptr_no_lock(szone_t *szone, const void *ptr); +static vm_range_t large_free_no_lock(szone_t *szone, large_entry_t *entry); #define LOG(szone,ptr) (szone->log_address && (szone->num_small_objects > 8) && (((unsigned)szone->log_address == -1) || (szone->log_address == (void *)(ptr)))) @@ -931,11 +933,9 @@ static void large_entries_grow_no_lock(szone_t *szone) { } static vm_range_t large_free_no_lock(szone_t *szone, large_entry_t *entry) { - // enters the specified large entry into the cache of freed entries - // returns a range to truly deallocate - vm_range_t vm_range_to_deallocate; + // frees the specific entry in the size table + // returns a range to truly deallocate, taking into account vm_range_t range; - vm_range_t *range_to_use; range.address = LARGE_ENTRY_ADDRESS(*entry); range.size = LARGE_ENTRY_SIZE(*entry); szone->num_large_objects_in_use --; @@ -956,6 +956,18 @@ static vm_range_t large_free_no_lock(szone_t *szone, large_entry_t *entry) { sleep(3600); } #endif + return range; +} + +static vm_range_t large_find_better_range_to_deallocate(szone_t *szone, vm_range_t range) { + // enters the specified large entry into the cache of freed entries + // returns a range to truly deallocate + vm_range_t *range_to_use; + vm_range_t vm_range_to_deallocate; + + // if the specified range in larger than MAX_LARGE_SIZE_TO_CACHE the range is not cached + if (range.size > MAX_LARGE_SIZE_TO_CACHE) return range; + range = coalesce_range(szone->large_to_deallocate, LARGE_CACHE_SIZE, range); range_to_use = first_zero_range(szone->large_to_deallocate, LARGE_CACHE_SIZE); if (range_to_use) { @@ -1185,6 +1197,7 @@ static void szone_free(szone_t *szone, void *ptr) { vm_msync(mach_task_self(), LARGE_ENTRY_ADDRESS(*entry), LARGE_ENTRY_SIZE(*entry), VM_SYNC_KILLPAGES); } 
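Aside on the free path above: large_free_no_lock now only removes the entry from the large-entry table and returns the freed range, and the new large_find_better_range_to_deallocate decides whether to park that range in the small cache of recently freed large blocks (LARGE_CACHE_SIZE entries, after coalescing adjacent ranges) or hand it back for immediate deallocation; anything bigger than MAX_LARGE_SIZE_TO_CACHE (128KB) is never cached. A minimal sketch of that decision, leaving out coalescing and locking and using hypothetical names in place of the szone helpers:

#include <stddef.h>

typedef struct { void *address; size_t size; } range_t;

#define CACHE_SLOTS        1            /* mirrors LARGE_CACHE_SIZE        */
#define MAX_SIZE_TO_CACHE  (128*1024)   /* mirrors MAX_LARGE_SIZE_TO_CACHE */

/*
 * Given a just-freed large range, either park it in the cache of recently
 * freed ranges (so a following large allocation can reuse it) or return it
 * unchanged so the caller truly deallocates it.
 */
static range_t large_cache_or_return(range_t cache[], range_t freed)
{
    unsigned i;
    range_t evicted;

    /* Huge blocks are never cached: give them straight back to the VM. */
    if (freed.size > MAX_SIZE_TO_CACHE)
        return freed;

    for (i = 0; i < CACHE_SLOTS; i++) {
        if (cache[i].size == 0) {          /* empty slot: keep the range  */
            cache[i] = freed;
            freed.address = NULL;
            freed.size = 0;                /* nothing left to deallocate  */
            return freed;
        }
    }

    /* Cache full: evict one entry and deallocate that one instead. */
    evicted = cache[0];
    cache[0] = freed;
    return evicted;
}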
vm_range_to_deallocate = large_free_no_lock(szone, entry); + vm_range_to_deallocate = large_find_better_range_to_deallocate(szone, vm_range_to_deallocate); #if DEBUG_MALLOC if (large_entry_for_pointer_no_lock(szone, ptr)) { malloc_printf("*** malloc[%d]: Just after freeing 0x%x still in use num_large_entries=%d\n", getpid(), ptr, szone->num_large_entries); @@ -1386,12 +1399,27 @@ static void *szone_realloc(szone_t *szone, void *ptr, size_t new_size) { if (szone_try_realloc_in_place(szone, ptr, old_size, new_size)) return ptr; } newPtr = szone_malloc(szone, new_size); - if (old_size > VM_COPY_THRESHOLD) { + if ((old_size > VM_COPY_THRESHOLD) && (old_size < (1 << (vm_page_shift + vm_page_shift)))) { + // we know it's a large block, and not a huge block kern_return_t err = 0; err = vm_copy(mach_task_self(), (vm_address_t)ptr, old_size, (vm_address_t)newPtr); if (err) { szone_error(szone, "Can't vm_copy region", ptr); - } + } else { + large_entry_t *entry; + vm_range_t range; + SZONE_LOCK(szone); + entry = large_entry_for_pointer_no_lock(szone, ptr); + if (!entry) { + szone_error(szone, "Can't find entry for large copied block", ptr); + } + range = large_free_no_lock(szone, entry); + SZONE_UNLOCK(szone); // we release the lock asap + // we truly deallocate_pages, including guard pages + deallocate_pages(szone, range.address, range.size, 0); + if (LOG(szone, ptr)) malloc_printf("szone_realloc returned %p for %d\n", newPtr, (unsigned)new_size); + return newPtr; + } } else { memcpy(newPtr, ptr, old_size); } diff --git a/locale.subproj/rune.c b/locale.subproj/rune.c index 2325a7e..631c815 100644 --- a/locale.subproj/rune.c +++ b/locale.subproj/rune.c @@ -92,7 +92,7 @@ setrunelocale(encoding) return(0); } - if (!PathLocale && !(PathLocale = getenv("PATH_LOCALE"))) + if (!PathLocale) PathLocale = _PATH_LOCALE; sprintf(name, "%s/%s/LC_CTYPE", PathLocale, encoding); diff --git a/locale.subproj/setlocale.c b/locale.subproj/setlocale.c index 8011e68..7dc8b93 100644 --- a/locale.subproj/setlocale.c +++ b/locale.subproj/setlocale.c @@ -105,7 +105,7 @@ setlocale(category, locale) int found, i, len; char *env, *r; - if (!PathLocale && !(PathLocale = getenv("PATH_LOCALE"))) + if (!PathLocale) PathLocale = _PATH_LOCALE; if (category < 0 || category >= _LC_LAST) diff --git a/mach.subproj/mach_init.c b/mach.subproj/mach_init.c index 6fac3a0..2d3bd9c 100644 --- a/mach.subproj/mach_init.c +++ b/mach.subproj/mach_init.c @@ -123,7 +123,7 @@ int mach_init_doit(int forkchild) _atfork_child_routine = mach_atfork_child_routine; _pthread_set_self(0); cthread_set_self(0); - } + } /* * Initialize the single mig reply port @@ -209,11 +209,11 @@ int fork_mach_init() mach_port_t mach_task_self() { - return(mach_task_self_); + return(task_self_trap()); } mach_port_t mach_thread_self() { return(thread_self_trap()); -} \ No newline at end of file +} diff --git a/pthreads.subproj/pthread.c b/pthreads.subproj/pthread.c index ddf28a8..3a927bd 100644 --- a/pthreads.subproj/pthread.c +++ b/pthreads.subproj/pthread.c @@ -55,8 +55,10 @@ extern pthread_lock_t reply_port_lock; */ size_t _pthread_stack_size = 0; -int _spin_tries = 1; +int _spin_tries = 0; +#if !defined(__ppc__) int _cpu_has_altivec = 0; +#endif /* This global should be used (carefully) by anyone needing to know if a pthread has been ** created. 
@@ -105,14 +107,6 @@ extern mach_port_t thread_recycle_port; #endif -/* This is the struct used to recycle (or terminate) a thread */ -/* We stash the thread port into the reply port of the message */ - -typedef struct { - mach_msg_header_t header; - mach_msg_trailer_t trailer; -} recycle_msg_t; - /* Set the base address to use as the stack pointer, before adjusting due to the ABI */ static int @@ -514,12 +508,6 @@ pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize) } } -pthread_t _cachedThread = (pthread_t)0; - -void _clear_thread_cache(void) { - _cachedThread = (pthread_t)0; -} - /* * Create and start execution of a new thread. */ @@ -527,7 +515,6 @@ void _clear_thread_cache(void) { static void _pthread_body(pthread_t self) { - _clear_thread_cache(); _pthread_set_self(self); pthread_exit((self->fun)(self->arg)); } @@ -721,9 +708,9 @@ pthread_detach(pthread_t thread) thread->death = MACH_PORT_NULL; UNLOCK(thread->lock); if (num_joiners > 0) - { /* Have to tell these guys this thread can't be joined with */ - swtch_pri(0); - PTHREAD_MACH_CALL(semaphore_signal_all(thread->joiners), kern_res); + { + /* Wake up a joiner */ + PTHREAD_MACH_CALL(semaphore_signal(thread->joiners), kern_res); } /* Destroy 'control' semaphores */ PTHREAD_MACH_CALL(semaphore_destroy(mach_task_self(), @@ -731,6 +718,10 @@ pthread_detach(pthread_t thread) PTHREAD_MACH_CALL(semaphore_destroy(mach_task_self(), death), kern_res); return (ESUCCESS); + } else if (thread->detached == _PTHREAD_EXITED) { + UNLOCK(thread->lock); + pthread_join(thread, NULL); + return ESUCCESS; } else { UNLOCK(thread->lock); @@ -748,16 +739,20 @@ pthread_detach(pthread_t thread) /* terminated, it will be yanked out from under the mach_msg() call. */ static void _pthread_become_available(pthread_t thread) { - recycle_msg_t msg = { { 0 } }; + mach_msg_empty_rcv_t msg = { { 0 } }; kern_return_t ret; + if (thread->reply_port == MACH_PORT_NULL) { + thread->reply_port = mach_reply_port(); + } msg.header.msgh_size = sizeof msg - sizeof msg.trailer; msg.header.msgh_remote_port = thread_recycle_port; msg.header.msgh_local_port = MACH_PORT_NULL; msg.header.msgh_id = (int)thread; msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0); - ret = mach_msg(&msg.header, MACH_SEND_MSG, msg.header.msgh_size, 0, - MACH_PORT_NULL, MACH_MSG_TIMEOUT_NONE, + ret = mach_msg(&msg.header, MACH_SEND_MSG | MACH_RCV_MSG, + msg.header.msgh_size, sizeof msg, + thread->reply_port, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL); while (1) { ret = thread_suspend(thread->kernel_thread); @@ -767,17 +762,17 @@ static void _pthread_become_available(pthread_t thread) { /* Check to see if any threads are available. 
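_pthread_become_available() above now lazily creates a reply port and combines MACH_SEND_MSG with MACH_RCV_MSG, so announcing the thread to the recycle port and parking it until the reaper answers happen in a single trap. A standalone restatement of that call sequence as a sketch, not the libc function itself; the ports and the id encoding are whatever the caller chooses, and no error handling is shown:

#include <mach/mach.h>

/* Send a one-word identification message to the recycle port and, in the
 * same mach_msg() trap, block waiting for the reaper's reply on our own
 * reply port. */
static kern_return_t park_on_recycle_port(mach_port_t recycle_port,
                                          mach_port_t reply_port,
                                          int id)
{
    mach_msg_empty_rcv_t msg = { { 0 } };

    msg.header.msgh_size        = sizeof msg - sizeof msg.trailer;
    msg.header.msgh_remote_port = recycle_port;
    msg.header.msgh_local_port  = MACH_PORT_NULL;
    msg.header.msgh_id          = id;
    msg.header.msgh_bits        = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0);

    /* MACH_SEND_MSG|MACH_RCV_MSG: the caller is descheduled as soon as the
     * send completes, until something replies to reply_port. */
    return mach_msg(&msg.header, MACH_SEND_MSG | MACH_RCV_MSG,
                    msg.header.msgh_size, sizeof msg,
                    reply_port, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
}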
Return immediately */ -static kern_return_t _pthread_check_for_available_threads(recycle_msg_t *msg) { +static kern_return_t _pthread_check_for_available_threads(mach_msg_empty_rcv_t *msg) { return mach_msg(&msg->header, MACH_RCV_MSG|MACH_RCV_TIMEOUT, 0, - sizeof(recycle_msg_t), thread_recycle_port, 0, + sizeof(mach_msg_empty_rcv_t), thread_recycle_port, 0, MACH_PORT_NULL); } /* Terminate all available threads and deallocate their stacks */ static void _pthread_reap_threads(void) { kern_return_t ret; - recycle_msg_t msg = { { 0 } }; - while(_pthread_check_for_available_threads(&msg) == KERN_SUCCESS) { + mach_msg_empty_rcv_t msg = { { 0 } }; + while((ret = _pthread_check_for_available_threads(&msg)) == KERN_SUCCESS) { pthread_t th = (pthread_t)msg.header.msgh_id; mach_port_t kernel_thread = th->kernel_thread; mach_port_t reply_port = th->reply_port; @@ -807,31 +802,14 @@ static void _pthread_reap_threads(void) { } free(th); } + assert(ret == MACH_RCV_TIMED_OUT); } - -static void * -stackAddress(void) -{ - unsigned dummy; - return (void *)((unsigned)&dummy & ~ (PTHREAD_STACK_MIN - 1)); -} - -extern pthread_t _pthread_self(void); +/* For compatibility... */ pthread_t -pthread_self(void) -{ - void * myStack = (void *)0; - pthread_t cachedThread = _cachedThread; - if (cachedThread) { - myStack = stackAddress(); - if ((void *)((unsigned)(cachedThread->stackaddr - 1) & ~ (PTHREAD_STACK_MIN - 1)) == myStack) { - return cachedThread; - } - } - _cachedThread = _pthread_self(); - return _cachedThread; +_pthread_self() { + return pthread_self(); } /* @@ -844,7 +822,6 @@ pthread_exit(void *value_ptr) struct _pthread_handler_rec *handler; kern_return_t kern_res; int num_joiners; - _clear_thread_cache(); while ((handler = self->cleanup_stack) != 0) { (handler->routine)(handler->arg); @@ -860,10 +837,14 @@ pthread_exit(void *value_ptr) UNLOCK(self->lock); if (num_joiners > 0) { - swtch_pri(0); - PTHREAD_MACH_CALL(semaphore_signal_all(self->joiners), kern_res); + /* POSIX says that multiple pthread_join() calls on */ + /* the same thread are undefined so we just wake up */ + /* the first one to join */ + PTHREAD_MACH_CALL(semaphore_signal(self->joiners), kern_res); } - PTHREAD_MACH_CALL(semaphore_wait(self->death), kern_res); + do { + PTHREAD_MACH_CALL(semaphore_wait(self->death), kern_res); + } while (kern_res == KERN_ABORTED); } else UNLOCK(self->lock); /* Destroy thread & reclaim resources */ @@ -896,7 +877,9 @@ pthread_join(pthread_t thread, { thread->num_joiners++; UNLOCK(thread->lock); - PTHREAD_MACH_CALL(semaphore_wait(thread->joiners), kern_res); + do { + PTHREAD_MACH_CALL(semaphore_wait(thread->joiners), kern_res); + } while (kern_res == KERN_ABORTED); LOCK(thread->lock); thread->num_joiners--; } @@ -909,7 +892,6 @@ pthread_join(pthread_t thread, *value_ptr = thread->exit_value; } UNLOCK(thread->lock); - swtch_pri(0); PTHREAD_MACH_CALL(semaphore_signal(thread->death), kern_res); return (ESUCCESS); } else @@ -1183,14 +1165,10 @@ pthread_init(void) } attrs = &_attr; pthread_attr_init(attrs); - _clear_thread_cache(); - _pthread_set_self(&_thread); + _pthread_set_self(&_thread); _pthread_create(&_thread, attrs, USRSTACK, mach_thread_self()); - thread = (pthread_t)malloc(sizeof(struct _pthread)); - memcpy(thread, &_thread, sizeof(struct _pthread)); - _clear_thread_cache(); - _pthread_set_self(thread); + thread = &_thread; thread->detached = _PTHREAD_CREATE_PARENT; /* See if we're on a multiprocessor and set _spin_tries if so. 
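pthread_exit() and pthread_join() above now loop while semaphore_wait() returns KERN_ABORTED, since an aborted wait means the call was interrupted rather than signalled. The same idiom as a small wrapper one might use; a sketch, not libc's actual helper:

#include <mach/mach.h>
#include <mach/semaphore.h>

/* Wait on a Mach semaphore, retrying when the wait is aborted (for example
 * across a fork or a signal) without the semaphore ever being signalled. */
static kern_return_t semaphore_wait_no_abort(semaphore_t sem)
{
    kern_return_t kr;

    do {
        kr = semaphore_wait(sem);
    } while (kr == KERN_ABORTED);

    return kr;
}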
*/ @@ -1199,7 +1177,7 @@ pthread_init(void) len = sizeof(numcpus); if (sysctl(mib, 2, &numcpus, &len, NULL, 0) == 0) { if (numcpus > 1) { - _spin_tries = SPIN_TRIES; + _spin_tries = MP_SPIN_TRIES; } } else { count = HOST_BASIC_INFO_COUNT; @@ -1210,7 +1188,7 @@ pthread_init(void) printf("host_info failed (%d)\n", kr); else { if (basic_info.avail_cpus > 1) - _spin_tries = SPIN_TRIES; + _spin_tries = MP_SPIN_TRIES; /* This is a crude test */ if (basic_info.cpu_subtype >= CPU_SUBTYPE_POWERPC_7400) _cpu_has_altivec = 1; diff --git a/pthreads.subproj/pthread_cond.c b/pthreads.subproj/pthread_cond.c index ae6ef05..4eee4a5 100644 --- a/pthreads.subproj/pthread_cond.c +++ b/pthreads.subproj/pthread_cond.c @@ -295,7 +295,8 @@ _pthread_cond_wait(pthread_cond_t *cond, if ((res = pthread_mutex_lock(mutex)) != ESUCCESS) { return (res); } - if (kern_res == KERN_SUCCESS) { + /* KERN_ABORTED can be treated as a spurious wakeup */ + if ((kern_res == KERN_SUCCESS) || (kern_res == KERN_ABORTED)) { return (ESUCCESS); } else if (kern_res == KERN_OPERATION_TIMED_OUT) { return (ETIMEDOUT); diff --git a/pthreads.subproj/pthread_internals.h b/pthreads.subproj/pthread_internals.h index 1e96b3c..2cfde61 100644 --- a/pthreads.subproj/pthread_internals.h +++ b/pthreads.subproj/pthread_internals.h @@ -198,24 +198,33 @@ extern boolean_t swtch_pri(int); /* Number of times to spin when the lock is unavailable and we are on a multiprocessor. On a uniprocessor we yield the processor immediately. */ -#define SPIN_TRIES 10 +#define MP_SPIN_TRIES 1000 extern int _spin_tries; extern int __is_threaded; extern int _cpu_has_altivec; /* Internal mutex locks for data structures */ -#define TRY_LOCK(v) (!__is_threaded || _spin_lock_try((pthread_lock_t *)&v)) -#if 0 -#define LOCK(v) if (__is_threaded) _spin_lock((pthread_lock_t)&v) -#else -#define LOCK(v) \ - if (__is_threaded) { \ - while (!_spin_lock_try((pthread_lock_t *)&v)) { \ - syscall_thread_switch(THREAD_NULL, SWITCH_OPTION_WAIT, 1); \ - } \ - } -#endif -#define UNLOCK(v) if (__is_threaded) _spin_unlock((pthread_lock_t *)&v) +#define TRY_LOCK(v) (!__is_threaded || _spin_lock_try((pthread_lock_t *)&(v))) +#define LOCK(v) \ +do { \ + if (__is_threaded) { \ + int tries = _spin_tries; \ + \ + while (!_spin_lock_try((pthread_lock_t *)&(v))) { \ + if (tries-- > 0) \ + continue; \ + \ + syscall_thread_switch(THREAD_NULL, SWITCH_OPTION_DEPRESS, 1); \ + tries = _spin_tries; \ + } \ + } \ +} while (0) +#define UNLOCK(v) \ +do { \ + if (__is_threaded) \ + _spin_unlock((pthread_lock_t *)&(v)); \ +} while (0) + #ifndef ESUCCESS #define ESUCCESS 0 #endif diff --git a/pthreads.subproj/pthread_mutex.c b/pthreads.subproj/pthread_mutex.c index 1276e60..427026a 100644 --- a/pthreads.subproj/pthread_mutex.c +++ b/pthreads.subproj/pthread_mutex.c @@ -141,7 +141,9 @@ pthread_mutex_lock(pthread_mutex_t *mutex) mutex->sem = new_sem_from_pool(); } UNLOCK(mutex->lock); - PTHREAD_MACH_CALL(semaphore_wait(mutex->sem), kern_res); + do { + PTHREAD_MACH_CALL(semaphore_wait(mutex->sem), kern_res); + } while (kern_res == KERN_ABORTED); LOCK(mutex->lock); mutex->waiters--; if (mutex->waiters == 0) { diff --git a/stdio.subproj/vfprintf.c b/stdio.subproj/vfprintf.c index f819e25..2e49b6b 100644 --- a/stdio.subproj/vfprintf.c +++ b/stdio.subproj/vfprintf.c @@ -276,7 +276,7 @@ __uqtoa(val, endp, base, octzero, xdigs) #define BUF (MAXEXP+MAXFRACT+1) /* + decimal point */ #define DEFPREC 6 -static char *cvt __P((double, int, int, char *, int *, int, int *)); +static char *cvt __P((double, int, int, char *, int *, 
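The new LOCK() macro spins up to _spin_tries times on the try-lock, then yields and restarts the count; _spin_tries is 0 on a uniprocessor and MP_SPIN_TRIES (1000) on a multiprocessor. A portable C11 sketch of that shape, using atomic_flag for the spin lock and sched_yield() as a stand-in for syscall_thread_switch(THREAD_NULL, SWITCH_OPTION_DEPRESS, 1):

#include <stdatomic.h>
#include <sched.h>

static atomic_flag the_lock = ATOMIC_FLAG_INIT;
static int spin_tries = 1000;           /* 0 on a uniprocessor */

static void spin_then_yield_lock(void)
{
    int tries = spin_tries;

    while (atomic_flag_test_and_set_explicit(&the_lock, memory_order_acquire)) {
        if (tries-- > 0)
            continue;                   /* still within the spin budget */
        sched_yield();                  /* give up the CPU, then spin again */
        tries = spin_tries;
    }
}

static void spin_unlock(void)
{
    atomic_flag_clear_explicit(&the_lock, memory_order_release);
}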
int, int *, char **)); static int exponent __P((char *, int, int)); #else /* no FLOATING_POINT */ @@ -322,6 +322,7 @@ vfprintf(fp, fmt0, ap) int expsize = 0; /* character count for expstr */ int ndig; /* actual number of digits returned by cvt */ char expstr[7]; /* buffer for exponent string */ + char *dtoaresult; /* buffer allocated by dtoa */ #endif u_long ulval = 0; /* integer arguments %[diouxX] */ u_quad_t uqval = 0; /* %q integers */ @@ -428,8 +429,9 @@ vfprintf(fp, fmt0, ap) } else { \ val = GETARG (int); \ } - - +#ifdef FLOATING_POINT + dtoaresult = NULL; +#endif /* FLOCKFILE(fp); */ /* sorry, fprintf(read_only_file, "") returns EOF, not 0 */ if (cantwrite(fp)) { @@ -621,7 +623,7 @@ fp_begin: if (prec == -1) } flags |= FPT; cp = cvt(_double, prec, flags, &softsign, - &expt, ch, &ndig); + &expt, ch, &ndig, &dtoaresult); if (ch == 'g' || ch == 'G') { if (expt <= -4 || expt > prec) ch = (ch == 'g') ? 'e' : 'E'; @@ -877,6 +879,10 @@ number: if ((dprec = prec) >= 0) done: FLUSH(); error: +#ifdef FLOATING_POINT + if (dtoaresult != NULL) + free(dtoaresult); +#endif if (__sferror(fp)) ret = EOF; /* FUNLOCKFILE(fp); */ @@ -911,7 +917,7 @@ error: * Find all arguments when a positional parameter is encountered. Returns a * table, indexed by argument number, of pointers to each arguments. The * initial argument table should be an array of STATIC_ARG_TBL_SIZE entries. - * It will be replaces with a malloc-ed on if it overflows. + * It will be replaces with a malloc-ed one if it overflows. */ static void __find_arguments (fmt0, ap, argtable) @@ -937,8 +943,8 @@ __find_arguments (fmt0, ap, argtable) #define ADDTYPE(type) \ ((nextarg >= tablesize) ? \ __grow_type_table(nextarg, &typetable, &tablesize) : 0, \ - typetable[nextarg++] = type, \ - (nextarg > tablemax) ? tablemax = nextarg : 0) + (nextarg > tablemax) ? tablemax = nextarg : 0, \ + typetable[nextarg++] = type) #define ADDSARG() \ ((flags&LONGINT) ? 
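The vfprintf changes above drop the static result buffer inside __dtoa: the digits now come back in a malloc'd buffer handed out through the new dtoaresult/resultp parameter, and vfprintf frees it once on the way out, which makes the path reentrant. A small sketch of that ownership convention; the function name and the digit formatting are illustrative, not the real __dtoa:

#include <stdio.h>
#include <stdlib.h>

/* The conversion routine allocates its digit buffer with malloc() and
 * returns it through an out-parameter; the single caller frees it once
 * formatting is done, so no static state survives between calls. */
static char *format_digits(double value, char **resultp)
{
    char *buf = malloc(32);

    if (buf == NULL)
        return NULL;
    snprintf(buf, 32, "%.17g", value);
    *resultp = buf;                 /* caller owns and eventually frees this */
    return buf;
}

int main(void)
{
    char *dtoaresult = NULL;

    if (format_digits(3.14159, &dtoaresult) != NULL)
        printf("digits: %s\n", dtoaresult);
    free(dtoaresult);               /* freed exactly once, on the way out */
    return 0;
}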
ADDTYPE(T_LONG) : \ @@ -1191,33 +1197,38 @@ __grow_type_table (nextarg, typetable, tablesize) unsigned char **typetable; int *tablesize; { - unsigned char *oldtable = *typetable; - int newsize = *tablesize * 2; - - if (*tablesize == STATIC_ARG_TBL_SIZE) { - *typetable = (unsigned char *) - malloc (sizeof (unsigned char) * newsize); - bcopy (oldtable, *typetable, *tablesize); + unsigned char *const oldtable = *typetable; + const int oldsize = *tablesize; + unsigned char *newtable; + int newsize = oldsize * 2; + + if (newsize < nextarg + 1) + newsize = nextarg + 1; + if (oldsize == STATIC_ARG_TBL_SIZE) { + if ((newtable = malloc (newsize)) == NULL) + abort(); /* XXX handle better */ + bcopy (oldtable, newtable, oldsize); } else { - *typetable = (unsigned char *) - realloc (typetable, sizeof (unsigned char) * newsize); - + if ((newtable = realloc (oldtable, newsize)) == NULL) + abort(); /* XXX handle better */ } - memset (&typetable [*tablesize], T_UNUSED, (newsize - *tablesize)); + memset (&newtable [oldsize], T_UNUSED, (newsize - oldsize)); + *typetable = newtable; *tablesize = newsize; } #ifdef FLOATING_POINT -extern char *__dtoa __P((double, int, int, int *, int *, char **)); +extern char *__dtoa __P((double, int, int, int *, int *, char **, char **)); static char * -cvt(value, ndigits, flags, sign, decpt, ch, length) +cvt(value, ndigits, flags, sign, decpt, ch, length, dtoaresultp) double value; int ndigits, flags, *decpt, ch, *length; char *sign; + char **dtoaresultp; { int mode, dsgn; char *digits, *bp, *rve; @@ -1239,7 +1250,7 @@ cvt(value, ndigits, flags, sign, decpt, ch, length) *sign = '-'; } else *sign = '\000'; - digits = __dtoa(value, mode, ndigits, decpt, &dsgn, &rve); + digits = __dtoa(value, mode, ndigits, decpt, &dsgn, &rve, dtoaresultp); if ((ch != 'g' && ch != 'G') || flags & ALT) { /* print trailing zeros */ bp = digits + ndigits; diff --git a/stdio.subproj/vfscanf.c b/stdio.subproj/vfscanf.c index c245bfc..a12e167 100644 --- a/stdio.subproj/vfscanf.c +++ b/stdio.subproj/vfscanf.c @@ -80,6 +80,7 @@ #define SUPPRESS 0x08 /* suppress assignment */ #define POINTER 0x10 /* weird %p pointer (`fake hex') */ #define NOSKIP 0x20 /* do not skip blanks */ +#define QUAD 0x400 /* * The following are used in numeric conversions only: @@ -101,13 +102,13 @@ #define CT_CHAR 0 /* %c conversion */ #define CT_CCL 1 /* %[...] conversion */ #define CT_STRING 2 /* %s conversion */ -#define CT_INT 3 /* integer, i.e., strtol or strtoul */ +#define CT_INT 3 /* integer, i.e., strtoq or strtouq */ #define CT_FLOAT 4 /* floating, i.e., strtod */ #define u_char unsigned char #define u_long unsigned long -static u_char *__sccl(); +static u_char *__sccl(char *, u_char *); /* * vfscanf @@ -127,8 +128,8 @@ __svfscanf(fp, fmt0, ap) register char *p0; /* saves original value of p when necessary */ int nassigned; /* number of fields assigned */ int nread; /* number of characters consumed from fp */ - int base; /* base argument to strtol/strtoul */ - u_long (*ccfn)(); /* conversion function (strtol/strtoul) */ + int base; /* base argument to strtoq/strtouq */ + u_quad_t (*ccfn)(); /* conversion function (strtoq/strtouq) */ char ccltab[256]; /* character class table for %[...] 
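The __grow_type_table() rewrite above malloc's and copies on the first growth, because the initial table lives in static storage and cannot be handed to realloc(), and realloc's thereafter; it also clamps the new size so the requested argument index always fits. A self-contained sketch of that logic with illustrative constants:

#include <stdlib.h>
#include <string.h>

#define STATIC_TBL_SIZE 8
#define T_UNUSED        0

static void grow_type_table(int nextarg, unsigned char **typetable, int *tablesize)
{
    unsigned char *oldtable = *typetable;
    int oldsize = *tablesize;
    int newsize = oldsize * 2;
    unsigned char *newtable;

    if (newsize < nextarg + 1)
        newsize = nextarg + 1;          /* make sure the requested slot fits */
    if (oldsize == STATIC_TBL_SIZE) {
        /* first growth: the old table is static, so copy rather than realloc */
        if ((newtable = malloc(newsize)) == NULL)
            abort();                    /* the library punts here too */
        memcpy(newtable, oldtable, oldsize);
    } else {
        if ((newtable = realloc(oldtable, newsize)) == NULL)
            abort();
    }
    memset(&newtable[oldsize], T_UNUSED, newsize - oldsize);
    *typetable = newtable;
    *tablesize = newsize;
}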
*/ char buf[BUF]; /* buffer for numeric conversions */ @@ -180,6 +181,9 @@ literal: case 'l': flags |= LONG; goto again; + case 'q': + flags |= QUAD; + goto again; case 'L': flags |= LONGDBL; goto again; @@ -204,13 +208,13 @@ literal: /* FALLTHROUGH */ case 'd': c = CT_INT; - ccfn = (u_long (*)())strtol; + ccfn = (u_quad_t (*)())strtoq; base = 10; break; case 'i': c = CT_INT; - ccfn = (u_long (*)())strtol; + ccfn = (u_quad_t (*)())strtoq; base = 0; break; @@ -219,13 +223,13 @@ literal: /* FALLTHROUGH */ case 'o': c = CT_INT; - ccfn = strtoul; + ccfn = strtouq; base = 8; break; case 'u': c = CT_INT; - ccfn = strtoul; + ccfn = strtouq; base = 10; break; @@ -235,7 +239,7 @@ literal: case 'x': flags |= PFXOK; /* enable 0x prefixing */ c = CT_INT; - ccfn = strtoul; + ccfn = strtouq; base = 16; break; @@ -267,7 +271,7 @@ literal: case 'p': /* pointer format is like hex */ flags |= POINTER | PFXOK; c = CT_INT; - ccfn = strtoul; + ccfn = strtouq; base = 16; break; @@ -278,6 +282,8 @@ literal: *va_arg(ap, short *) = nread; else if (flags & LONG) *va_arg(ap, long *) = nread; + else if (flags & QUAD) + *va_arg(ap, quad_t *) = nread; else *va_arg(ap, int *) = nread; continue; @@ -292,7 +298,7 @@ literal: if (isupper(c)) flags |= LONG; c = CT_INT; - ccfn = (u_long (*)())strtol; + ccfn = (u_quad_t (*)())strtoq; base = 10; break; } @@ -434,7 +440,7 @@ literal: continue; case CT_INT: - /* scan an integer as if by strtol/strtoul */ + /* scan an integer as if by strtoq/strtouq */ #ifdef hardway if (width == 0 || width > sizeof(buf) - 1) width = sizeof(buf) - 1; @@ -552,7 +558,7 @@ literal: (void) ungetc(c, fp); } if ((flags & SUPPRESS) == 0) { - u_long res; + u_quad_t res; *p = 0; res = (*ccfn)(buf, (char **)NULL, base); @@ -562,6 +568,8 @@ literal: *va_arg(ap, short *) = res; else if (flags & LONG) *va_arg(ap, long *) = res; + else if (flags & QUAD) + *va_arg(ap, quad_t *) = res; else *va_arg(ap, int *) = res; nassigned++; @@ -651,7 +659,9 @@ literal: *p = 0; res = strtod(buf,(char **) NULL); - if (flags & LONG) + if (flags & LONGDBL) + *va_arg(ap, long double *) = res; + else if (flags & LONG) *va_arg(ap, double *) = res; else *va_arg(ap, float *) = res; diff --git a/stdlib.subproj/strtod.c b/stdlib.subproj/strtod.c index 0ed39d8..05c075e 100644 --- a/stdlib.subproj/strtod.c +++ b/stdlib.subproj/strtod.c @@ -386,7 +386,7 @@ extern double rnd_prod(double, double), rnd_quot(double, double); #ifdef __cplusplus extern "C" double strtod(const char *s00, char **se); extern "C" char *__dtoa(double d, int mode, int ndigits, - int *decpt, int *sign, char **rve); + int *decpt, int *sign, char **rve, char **resultp); #endif struct @@ -398,8 +398,6 @@ Bigint { typedef struct Bigint Bigint; - static Bigint *freelist[Kmax+1]; - static Bigint * Balloc #ifdef KR_headers @@ -411,18 +409,13 @@ Balloc int x; Bigint *rv; - if (rv = freelist[k]) { - freelist[k] = rv->next; - } - else { - x = 1 << k; - rv = (Bigint *)MALLOC(sizeof(Bigint) + (x-1)*sizeof(Long)); - rv->k = k; - rv->maxwds = x; - } + x = 1 << k; + rv = (Bigint *)malloc(sizeof(Bigint) + (x-1)*sizeof(Long)); + rv->k = k; + rv->maxwds = x; rv->sign = rv->wds = 0; return rv; - } +} static void Bfree @@ -432,11 +425,8 @@ Bfree (Bigint *v) #endif { - if (v) { - v->next = freelist[v->k]; - freelist[v->k] = v; - } - } + free(v); +} #define Bcopy(x,y) memcpy((char *)&x->sign, (char *)&y->sign, \ y->wds*sizeof(Long) + 2*sizeof(int)) @@ -1916,9 +1906,9 @@ quorem __dtoa #ifdef KR_headers (d, mode, ndigits, decpt, sign, rve) - double d; int mode, ndigits, *decpt, *sign; char 
**rve; + double d; int mode, ndigits, *decpt, *sign; char **rve, char **resultp; #else - (double d, int mode, int ndigits, int *decpt, int *sign, char **rve) + (double d, int mode, int ndigits, int *decpt, int *sign, char **rve, char **resultp) #endif { /* Arguments ndigits, decpt, sign are similar to those @@ -1966,15 +1956,6 @@ __dtoa Bigint *b, *b1, *delta, *mlo, *mhi, *S; double d2, ds, eps; char *s, *s0; - static Bigint *result; - static int result_k; - - if (result) { - result->k = result_k; - result->maxwds = 1 << result_k; - Bfree(result); - result = 0; - } if (word0(d) & Sign_bit) { /* set sign for everything, including 0's and NaNs */ @@ -2136,11 +2117,8 @@ __dtoa if (i <= 0) i = 1; } - j = sizeof(ULong); - for(result_k = 0; sizeof(Bigint) - sizeof(ULong) + j <= i; - j <<= 1) result_k++; - result = Balloc(result_k); - s = s0 = (char *)result; + *resultp = (char *) malloc(i + 1); + s = s0 = *resultp; if (ilim >= 0 && ilim <= Quick_max && try_quick) { diff --git a/string.subproj/memccpy.c b/string.subproj/memccpy.c index d925f12..657b4f9 100644 --- a/string.subproj/memccpy.c +++ b/string.subproj/memccpy.c @@ -67,9 +67,10 @@ memccpy(t, f, c, n) if (n) { register unsigned char *tp = t; register const unsigned char *fp = f; + register unsigned char uc = c; do { - if ((*tp++ = *fp++) == c) - return (t); + if ((*tp++ = *fp++) == uc) + return (tp); } while (--n != 0); } return (0); diff --git a/sys.subproj/gettimeofday.c b/sys.subproj/gettimeofday.c index d07239b..8aa14ac 100644 --- a/sys.subproj/gettimeofday.c +++ b/sys.subproj/gettimeofday.c @@ -36,21 +36,26 @@ int gettimeofday (struct timeval *tp, struct timezone *tzp) { static int validtz = 0; static struct timezone cached_tz = {0}; + struct timeval localtv; + + if (tzp && (tp == NULL) && (validtz == 0)) { + tp = &localtv; + } if (syscall (SYS_gettimeofday, tp, tzp) < 0) { return (-1); } - if (validtz == 0) { - struct tm *localtm = localtime ((time_t *)&tp->tv_sec); - cached_tz.tz_dsttime = localtm->tm_isdst; - cached_tz.tz_minuteswest = - (-localtm->tm_gmtoff / SECSPERMIN) + - (localtm->tm_isdst * MINSPERHOUR); - validtz = 1; - } if (tzp) { - tzp->tz_dsttime = cached_tz.tz_dsttime; - tzp->tz_minuteswest = cached_tz.tz_minuteswest; + if (validtz == 0) { + struct tm *localtm = localtime ((time_t *)&tp->tv_sec); + cached_tz.tz_dsttime = localtm->tm_isdst; + cached_tz.tz_minuteswest = + (-localtm->tm_gmtoff / SECSPERMIN) + + (localtm->tm_isdst * MINSPERHOUR); + validtz = 1; + } + tzp->tz_dsttime = cached_tz.tz_dsttime; + tzp->tz_minuteswest = cached_tz.tz_minuteswest; } return (0); } diff --git a/sys.subproj/i386.subproj/vfork.s b/sys.subproj/i386.subproj/vfork.s index 714a205..edd5f4f 100644 --- a/sys.subproj/i386.subproj/vfork.s +++ b/sys.subproj/i386.subproj/vfork.s @@ -24,6 +24,7 @@ */ #include "SYS.h" +#if 0 LEAF(_vfork, 0) CALL_EXTERN(__cthread_fork_prepare) #if defined(__DYNAMIC__) @@ -161,4 +162,24 @@ L2: CALL_EXTERN_AGAIN(__cthread_fork_parent) pop %eax ret +#else + +LEAF(_vfork, 0) + popl %ecx + movl $SYS_vfork,%eax; // code for vfork -> eax + UNIX_SYSCALL_TRAP; // do the system call + jnb L1 // jump if CF==0 + pushl %ecx + BRANCH_EXTERN(cerror) + +L1: + orl %edx,%edx // CF=OF=0, ZF set if zero result + jz L2 // parent, since r1 == 0 in parent, 1 in child + xorl %eax,%eax // zero eax + jmp *%ecx + +L2: + jmp *%ecx + +#endif diff --git a/sys.subproj/ppc.subproj/_longjmp.s b/sys.subproj/ppc.subproj/_longjmp.s index 4591e54..6bdeb02 100644 --- a/sys.subproj/ppc.subproj/_longjmp.s +++ b/sys.subproj/ppc.subproj/_longjmp.s @@ -34,11 
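The gettimeofday() fix above computes the cached timezone only when the caller actually passed a tzp, and substitutes a local timeval when tp is NULL so localtime() still gets valid seconds. A user-level sketch of the corrected control flow; it leans on the BSD tm_gmtoff field, and the minuteswest arithmetic mirrors the original:

#include <sys/time.h>
#include <time.h>

static int gettimeofday_with_tz(struct timeval *tp, struct timezone *tzp)
{
    static int validtz = 0;
    static struct timezone cached_tz;
    struct timeval localtv;

    if (tzp != NULL && tp == NULL)
        tp = &localtv;                    /* need somewhere to put tv_sec */

    if (gettimeofday(tp, NULL) < 0)
        return -1;

    if (tzp != NULL) {
        if (validtz == 0) {
            time_t secs = tp->tv_sec;
            struct tm *localtm = localtime(&secs);

            cached_tz.tz_dsttime = localtm->tm_isdst;
            cached_tz.tz_minuteswest = (int)(-localtm->tm_gmtoff / 60)
                                       + localtm->tm_isdst * 60;
            validtz = 1;                  /* compute once, reuse afterwards */
        }
        *tzp = cached_tz;
    }
    return 0;
}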
+34,159 @@ * 8 September 1998 Matt Watson (mwatson@apple.com) * Created. Derived from longjmp.s */ -#include "SYS.h" + #include #include "_setjmp.h" +#define VRSave 256 + +/* special flag bit definitions copied from /osfmk/ppc/thread_act.h */ + +#define floatUsedbit 1 +#define vectorUsedbit 2 + + +#if defined(__DYNAMIC__) + .data + .non_lazy_symbol_pointer + .align 2 +L_memmove$non_lazy_ptr: + .indirect_symbol _memmove + .long 0 + .non_lazy_symbol_pointer + .align 2 +L__cpu_has_altivec$non_lazy_ptr: + .indirect_symbol __cpu_has_altivec + .long 0 + .text +#endif + LEAF(__longjmp) + + ; need to restore FPRs or VRs? + + lwz r5,JMP_flags(r3) + lwz r6,JMP_addr_at_setjmp(r3) + rlwinm r7,r5,0,vectorUsedbit,vectorUsedbit + rlwinm r8,r5,0,floatUsedbit,floatUsedbit + cmpw cr1,r3,r6 ; jmp_buf still at same address? + cmpwi cr3,r7,0 ; set cr3 iff VRs in use (non-volatile CR) + cmpwi cr4,r8,0 ; set cr4 iff FPRs in use (non-volatile CR) + beq+ cr1,LRestoreVRs + + ; jmp_buf was moved since setjmp (or is uninitialized.) + ; We must move VRs and FPRs to be quadword aligned at present address. + + stw r3,JMP_addr_at_setjmp(r3) ; update, in case we longjmp to this again + mr r31,r4 ; save "val" arg across memmove + mr r30,r3 ; and jmp_buf ptr + addi r3,r3,JMP_vr_base_addr + addi r4,r6,JMP_vr_base_addr + rlwinm r3,r3,0,0,27 ; r3 <- QW aligned addr where they should be + rlwinm r4,r4,0,0,27 ; r4 <- QW aligned addr where they originally were + sub r7,r4,r6 ; r7 <- offset of VRs/FPRs within jmp_buf + add r4,r30,r7 ; r4 <- where they are now + li r5,(JMP_buf_end - JMP_vr_base_addr) +#if defined(__DYNAMIC__) + bcl 20,31,1f ; Get pic-base +1: mflr r12 + addis r12, r12, ha16(L_memmove$non_lazy_ptr - 1b) + lwz r12, lo16(L_memmove$non_lazy_ptr - 1b)(r12) + mtctr r12 ; Get address left by dyld + bctrl +#else + bl _memmove +#endif + mr r3,r30 + mr r4,r31 + + ; Restore VRs iff any + ; cr3 - bne if VRs + ; cr4 - bne if FPRs + +LRestoreVRs: + beq+ cr3,LZeroVRSave ; no VRs + lwz r0,JMP_vrsave(r3) + addi r6,r3,JMP_vr_base_addr + cmpwi r0,0 ; any live VRs? + mtspr VRSave,r0 + beq+ LRestoreFPRs + lvx v20,0,r6 + li r7,16*1 + lvx v21,r7,r6 + li r7,16*2 + lvx v22,r7,r6 + li r7,16*3 + lvx v23,r7,r6 + li r7,16*4 + lvx v24,r7,r6 + li r7,16*5 + lvx v25,r7,r6 + li r7,16*6 + lvx v26,r7,r6 + li r7,16*7 + lvx v27,r7,r6 + li r7,16*8 + lvx v28,r7,r6 + li r7,16*9 + lvx v29,r7,r6 + li r7,16*10 + lvx v30,r7,r6 + li r7,16*11 + lvx v31,r7,r6 + b LRestoreFPRs ; skip zeroing VRSave + + ; Zero VRSave iff Altivec is supported, but VRs were not in use + ; at setjmp time. This covers the case where VRs are first used after + ; the setjmp but before the longjmp, and where VRSave is nonzero at + ; the longjmp. We need to zero it now, or it will always remain + ; nonzero since they are sticky bits. 
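The __longjmp preamble above detects a jmp_buf that was copied since setjmp and memmove's the saved vector/float block to the quadword-aligned spot within the buffer's current address. The same computation expressed in C, using the offsets from _setjmp.h; the buffer pointers and layout handling are simplified to the one step being illustrated:

#include <stdint.h>
#include <string.h>

#define VR_BASE_OFFSET  0x84                      /* JMP_vr_base_addr */
#define SAVE_AREA_SIZE  (0x1d4 - 0x84)            /* JMP_buf_end - JMP_vr_base_addr */

/* If the jmp_buf was bcopy'd after setjmp, the 16-byte-aligned save area now
 * lands at a different offset, so move the saved registers (memmove, since
 * the regions may overlap) to where the restore code will look for them.
 * buf is the buffer's current address, addr_at_setjmp the address recorded
 * in the buffer when setjmp ran. */
static void realign_save_area(unsigned char *buf, unsigned char *addr_at_setjmp)
{
    if (buf == addr_at_setjmp)
        return;                                   /* still where setjmp put it */

    uintptr_t new_base = ((uintptr_t)buf + VR_BASE_OFFSET) & ~(uintptr_t)15;
    uintptr_t old_base = ((uintptr_t)addr_at_setjmp + VR_BASE_OFFSET) & ~(uintptr_t)15;
    size_t    old_off  = old_base - (uintptr_t)addr_at_setjmp;

    memmove((void *)new_base, buf + old_off, SAVE_AREA_SIZE);
}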
+ +LZeroVRSave: +#if defined(__DYNAMIC__) + bcl 20,31,1f +1: mflr r9 ; get our address + addis r6,r9,ha16(L__cpu_has_altivec$non_lazy_ptr - 1b) + lwz r7,lo16(L__cpu_has_altivec$non_lazy_ptr - 1b)(r6) + lwz r7,0(r7) ; load the flag +#else + lis r7, ha16(__cpu_has_altivec) + lwz r7, lo16(__cpu_has_altivec)(r7) +#endif + cmpwi r7,0 + li r8,0 + beq LRestoreFPRs ; no Altivec, so skip + mtspr VRSave,r8 + + ; Restore FPRs if any + ; cr4 - bne iff FPRs + +LRestoreFPRs: + beq cr4,LRestoreGPRs ; FPRs not in use at setjmp + addi r6,r3,JMP_fp_base_addr + rlwinm r6,r6,0,0,27 ; mask off low 4 bits to qw align + lfd f14,0*8(r6) + lfd f15,1*8(r6) + lfd f16,2*8(r6) + lfd f17,3*8(r6) + lfd f18,4*8(r6) + lfd f19,5*8(r6) + lfd f20,6*8(r6) + lfd f21,7*8(r6) + lfd f22,8*8(r6) + lfd f23,9*8(r6) + lfd f24,10*8(r6) + lfd f25,11*8(r6) + lfd f26,12*8(r6) + lfd f27,13*8(r6) + lfd f28,14*8(r6) + lfd f29,15*8(r6) + lfd f30,16*8(r6) + lfd f31,17*8(r6) + + ; Restore GPRs + +LRestoreGPRs: lwz r31, JMP_r31(r3) /* r1, r14-r30 */ lwz r1, JMP_r1 (r3) diff --git a/sys.subproj/ppc.subproj/_setjmp.h b/sys.subproj/ppc.subproj/_setjmp.h index e97255c..8a78817 100644 --- a/sys.subproj/ppc.subproj/_setjmp.h +++ b/sys.subproj/ppc.subproj/_setjmp.h @@ -28,6 +28,14 @@ * */ +/* NOTE: jmp_bufs are only 4-byte aligned. This means we + * need to pad before the VR and FPR save areas, so that they + * can be naturally aligned in the buffer. In case a jmp_buf + * is bcopy'd to a different alignment between the setjmp + * and longjmp, we need to save the jmp_buf address in the + * jmp_buf at setjmp time, so we can realign before reloading. + */ + #define JMP_r1 0x00 #define JMP_r2 0x04 #define JMP_r13 0x08 @@ -55,3 +63,13 @@ #define JMP_xer 0x60 #define JMP_sig 0x64 #define JMP_SIGFLAG 0x68 +#define JMP_flags 0x6c +#define JMP_vrsave 0x70 +#define JMP_addr_at_setjmp 0x74 +/* 12 bytes padding here */ +#define JMP_vr_base_addr 0x84 +/* save room for 12 VRs (v20-v31), or 0xC0 bytes */ +#define JMP_fp_base_addr 0x144 +/* save room for 18 FPRs (f14-f31), or 0x90 bytes */ +#define JMP_buf_end 0x1d4 + diff --git a/sys.subproj/ppc.subproj/_setjmp.s b/sys.subproj/ppc.subproj/_setjmp.s index 2be62c8..c69f9ad 100644 --- a/sys.subproj/ppc.subproj/_setjmp.s +++ b/sys.subproj/ppc.subproj/_setjmp.s @@ -33,10 +33,20 @@ * Created. Derived from setjmp.s */ -#include "SYS.h" + #include #include "_setjmp.h" +#define VRSave 256 + +/* special flag bit definitions copied from /osfmk/ppc/thread_act.h */ + +#define floatUsedbit 1 +#define vectorUsedbit 2 + +#define FlagsFastTrap 0x7FF3 + + LEAF(__setjmp) stw r31, JMP_r31(r3) /* r1, r2, r13-r30 */ @@ -68,6 +78,77 @@ LEAF(__setjmp) stw r5, JMP_lr(r3) stw r6, JMP_ctr(r3) stw r7, JMP_xer(r3) - li r3, 0 + + mr r31,r3 ; save jmp_buf ptr + li r0,FlagsFastTrap + sc ; get FPR-inuse and VR-inuse flags from kernel + rlwinm r4,r3,0,floatUsedbit,floatUsedbit + rlwinm. r5,r3,0,vectorUsedbit,vectorUsedbit + cmpwi cr1,r4,0 ; set CR1 bne iff FPRs in use + stw r3,JMP_flags(r31) + stw r31,JMP_addr_at_setjmp(r31) + mr r3,r31 ; restore jmp_buf ptr + lwz r31,JMP_r31(r31) + beq LSaveFPRsIfNecessary ; skip if vectorUsedbit was 0 + + ; must save VRs and VRSAVE + + mfspr r4,VRSave + andi. 
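The _setjmp.h comment above explains why the buffer needs padding: jmp_bufs are only 4-byte aligned, so the vector and float save areas sit at fixed offsets with enough slack to be pushed up to a 16-byte boundary. A compile-time check of that arithmetic (C11), using the offsets defined above:

#include <assert.h>

#define JMP_addr_at_setjmp 0x74
#define JMP_vr_base_addr   0x84
#define JMP_fp_base_addr   0x144
#define JMP_buf_end        0x1d4

/* 12 bytes of padding before the vector area, room for 12 quadword VRs
 * (v20-v31) and 18 doubleword FPRs (f14-f31). */
static_assert(JMP_vr_base_addr - (JMP_addr_at_setjmp + 4) == 12,
              "12 bytes of alignment padding");
static_assert(JMP_fp_base_addr - JMP_vr_base_addr == 12 * 16,
              "room for v20-v31");
static_assert(JMP_buf_end - JMP_fp_base_addr == 18 * 8,
              "room for f14-f31");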
r0,r4,0xFFF ; we only care about v20-v31 + stw r0,JMP_vrsave(r3) ; set up effective VRSAVE + beq LSaveFPRsIfNecessary ; no live non-volatile VRs + addi r6,r3,JMP_vr_base_addr + stvx v20,0,r6 + li r4,16*1 + stvx v21,r4,r6 + li r4,16*2 + stvx v22,r4,r6 + li r4,16*3 + stvx v23,r4,r6 + li r4,16*4 + stvx v24,r4,r6 + li r4,16*5 + stvx v25,r4,r6 + li r4,16*6 + stvx v26,r4,r6 + li r4,16*7 + stvx v27,r4,r6 + li r4,16*8 + stvx v28,r4,r6 + li r4,16*9 + stvx v29,r4,r6 + li r4,16*10 + stvx v30,r4,r6 + li r4,16*11 + stvx v31,r4,r6 + + ; must save FPRs if they are live in this thread + ; CR1 = bne iff FPRs are in use + +LSaveFPRsIfNecessary: + beq cr1,LExit ; FPRs not in use + addi r6,r3,JMP_fp_base_addr + rlwinm r6,r6,0,0,27 ; mask off low 4 bits to qw align + stfd f14,0*8(r6) + stfd f15,1*8(r6) + stfd f16,2*8(r6) + stfd f17,3*8(r6) + stfd f18,4*8(r6) + stfd f19,5*8(r6) + stfd f20,6*8(r6) + stfd f21,7*8(r6) + stfd f22,8*8(r6) + stfd f23,9*8(r6) + stfd f24,10*8(r6) + stfd f25,11*8(r6) + stfd f26,12*8(r6) + stfd f27,13*8(r6) + stfd f28,14*8(r6) + stfd f29,15*8(r6) + stfd f30,16*8(r6) + stfd f31,17*8(r6) + +LExit: + li r3, 0 blr diff --git a/sys.subproj/ppc.subproj/ur_cthread.s b/sys.subproj/ppc.subproj/ur_cthread.s index f3695ba..50ff2be 100644 --- a/sys.subproj/ppc.subproj/ur_cthread.s +++ b/sys.subproj/ppc.subproj/ur_cthread.s @@ -21,8 +21,8 @@ */ .text .align 2 - .globl __pthread_self -__pthread_self: + .globl _pthread_self +_pthread_self: li r0, 0x7FF2 sc blr diff --git a/sys.subproj/ppc.subproj/vfork.s b/sys.subproj/ppc.subproj/vfork.s index 6a3277a..14bc4f3 100644 --- a/sys.subproj/ppc.subproj/vfork.s +++ b/sys.subproj/ppc.subproj/vfork.s @@ -29,7 +29,7 @@ * */ -#if 1 +#if 0 #import #import #import diff --git a/threads.subproj/Makefile b/threads.subproj/Makefile index 66f86ea..a61e757 100644 --- a/threads.subproj/Makefile +++ b/threads.subproj/Makefile @@ -14,7 +14,7 @@ PROJECT_TYPE = Component HFILES = cthread_internals.h cthreads.h -CFILES = cprocs.c cthreads.c lu_utils.c mig_support.c threads_data.c +CFILES = cprocs.c cthreads.c lu_utils.c mig_support.c SUBPROJECTS = i386.subproj ppc.subproj diff --git a/threads.subproj/PB.project b/threads.subproj/PB.project index 0fbe5d1..e63fd07 100644 --- a/threads.subproj/PB.project +++ b/threads.subproj/PB.project @@ -2,7 +2,7 @@ DYNAMIC_CODE_GEN = YES; FILESTABLE = { H_FILES = (cthread_internals.h, cthreads.h); - OTHER_LINKED = (cprocs.c, cthreads.c, lu_utils.c, mig_support.c, threads_data.c); + OTHER_LINKED = (cprocs.c, cthreads.c, lu_utils.c, mig_support.c); OTHER_SOURCES = (Makefile.preamble, Makefile, Makefile.postamble); PROJECT_HEADERS = (cthread_internals.h, cthreads.h); SUBPROJECTS = (i386.subproj, ppc.subproj); diff --git a/threads.subproj/i386.subproj/thread.c b/threads.subproj/i386.subproj/thread.c index ee31e52..64595c3 100644 --- a/threads.subproj/i386.subproj/thread.c +++ b/threads.subproj/i386.subproj/thread.c @@ -48,7 +48,7 @@ _pthread_set_self(p) } void * -_pthread_self() +pthread_self() { asm("movl $0, %eax"); asm("lcall $0x3b, $0"); diff --git a/threads.subproj/threads_data.c b/threads.subproj/threads_data.c deleted file mode 100644 index 587b938..0000000 --- a/threads.subproj/threads_data.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). 
You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * This file contains global data and the size of the global data can NOT - * change or otherwise it would make the shared library incompatable. It - * is padded so that new data can take the place of storage occupied by part - * of it. - */ -int msg_send_timeout = 100; /* milliseconds */ -int msg_receive_timeout = 10; /* milliseconds */ -int mutex_spin_limit = 0; -int cthread_stack_mask = 0; -extern void cthread_init(); -unsigned int cproc_default_stack_size = 1000000; -int condition_spin_limit = 0; -int condition_yield_limit = 7; -unsigned int initial_stack_boundary = 0; -unsigned int cthread_stack_base = 0; /* Base for stack allocation */ -int malloc_lock = 0; /* - * Needs to be shared between malloc.o - * and malloc_utils.o - */ - -/* global data padding, must NOT be static */ -char _threads_data_padding[208] = { 0 }; diff --git a/util.subproj/pty.c b/util.subproj/pty.c index 8c9fc0e..aa0b2ad 100644 --- a/util.subproj/pty.c +++ b/util.subproj/pty.c @@ -82,7 +82,7 @@ int openpty(amaster, aslave, name, termp, winp) else ttygid = -1; - for (cp1 = "pqrs"; *cp1; cp1++) { + for (cp1 = "pqrstuvwxy"; *cp1; cp1++) { line[8] = *cp1; for (cp2 = "0123456789abcdef"; *cp2; cp2++) { line[5] = 'p'; -- 2.47.2
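The pty.c change at the end widens the master-device search from "pqrs" to "pqrstuvwxy", growing the pool of /dev/pty[p-y][0-9a-f] candidates from 64 to 160. A small program that simply enumerates the names openpty() would now probe:

#include <stdio.h>

int main(void)
{
    const char *letters = "pqrstuvwxy";
    const char *digits  = "0123456789abcdef";
    const char *cp1, *cp2;
    int count = 0;

    for (cp1 = letters; *cp1; cp1++) {
        for (cp2 = digits; *cp2; cp2++) {
            printf("/dev/pty%c%c\n", *cp1, *cp2);
            count++;
        }
    }
    printf("%d candidate master devices\n", count);
    return 0;
}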