From 3b2a1fe8d3d02703ddca1b0ead469074d4e47820 Mon Sep 17 00:00:00 2001 From: Apple Date: Wed, 29 Aug 2001 23:32:14 +0000 Subject: [PATCH] Libc-186.tar.gz --- Makefile.postamble | 1 + gen.subproj/crypt.c | 46 +- gen.subproj/popen.c | 57 +- gen.subproj/ppc.subproj/Makefile | 8 +- gen.subproj/ppc.subproj/PB.project | 4 +- gen.subproj/ppc.subproj/bcopy.s | 410 ----------- gen.subproj/ppc.subproj/blockmoof.s | 940 ++++++++++++++++++++++++++ gen.subproj/ppc.subproj/memcpy.s | 23 - gen.subproj/ppc.subproj/memmove.s | 23 - gen.subproj/scalable_malloc.c | 42 +- locale.subproj/rune.c | 2 +- locale.subproj/setlocale.c | 2 +- mach.subproj/mach_init.c | 6 +- pthreads.subproj/pthread.c | 100 ++- pthreads.subproj/pthread_cond.c | 3 +- pthreads.subproj/pthread_internals.h | 35 +- pthreads.subproj/pthread_mutex.c | 4 +- stdio.subproj/vfprintf.c | 53 +- stdio.subproj/vfscanf.c | 38 +- stdlib.subproj/strtod.c | 46 +- string.subproj/memccpy.c | 5 +- sys.subproj/gettimeofday.c | 25 +- sys.subproj/i386.subproj/vfork.s | 21 + sys.subproj/ppc.subproj/_longjmp.s | 150 +++- sys.subproj/ppc.subproj/_setjmp.h | 18 + sys.subproj/ppc.subproj/_setjmp.s | 85 ++- sys.subproj/ppc.subproj/ur_cthread.s | 4 +- sys.subproj/ppc.subproj/vfork.s | 2 +- threads.subproj/Makefile | 2 +- threads.subproj/PB.project | 2 +- threads.subproj/i386.subproj/thread.c | 2 +- threads.subproj/threads_data.c | 44 -- util.subproj/pty.c | 2 +- 33 files changed, 1485 insertions(+), 720 deletions(-) delete mode 100644 gen.subproj/ppc.subproj/bcopy.s create mode 100755 gen.subproj/ppc.subproj/blockmoof.s delete mode 100644 gen.subproj/ppc.subproj/memcpy.s delete mode 100644 gen.subproj/ppc.subproj/memmove.s delete mode 100644 threads.subproj/threads_data.c diff --git a/Makefile.postamble b/Makefile.postamble index e702b54..2083c13 100644 --- a/Makefile.postamble +++ b/Makefile.postamble @@ -5,6 +5,7 @@ PRODUCTS += $(LIBRARY_PREFIX)$(NAME)$(DEBUG_SUFFIX)$(LIBRARY_EXT) PRODUCTS += $(LIBRARY_PREFIX)$(NAME)$(PROFILE_SUFFIX)$(LIBRARY_EXT) PRODUCTS += $(LIBRARY_PREFIX)$(NAME)$(STATIC_SUFFIX)$(LIBRARY_EXT) RECURSIVE_FLAGS += "LINK_SUBPROJECTS = NO" +OTHER_CFLAGS += -no-cpp-precomp -force_cpusubtype_ALL static: $(SILENT) unset $(CUMULATIVE_VARIABLES) ||: ; \ diff --git a/gen.subproj/crypt.c b/gen.subproj/crypt.c index 5e2caec..2f56953 100644 --- a/gen.subproj/crypt.c +++ b/gen.subproj/crypt.c @@ -59,6 +59,7 @@ #include #include #include +#include /* * UNIX password, and DES, encryption. @@ -465,19 +466,24 @@ static unsigned char itoa64[] = /* 0..63 => ascii-64 */ static unsigned char a64toi[128]; /* ascii-64 => 0..63 */ /* Initial key schedule permutation */ -static C_block PC1ROT[64/CHUNKBITS][1< final permutation table */ -static C_block CF6464[64/CHUNKBITS][1<> 1) & 0x55555555L; L1 = R0 | R1; /* L1 is the odd-numbered input bits */ STORE(L,L0,L1,B); - PERM3264(L,L0,L1,B.b, (C_block *)IE3264); /* even bits */ - PERM3264(R,R0,R1,B.b+4,(C_block *)IE3264); /* odd bits */ + PERM3264(L,L0,L1,B.b,IE3264); /* even bits */ + PERM3264(R,R0,R1,B.b+4,IE3264); /* odd bits */ if (num_iter >= 0) { /* encryption */ @@ -689,14 +695,14 @@ STATIC int des_cipher(in, out, salt, num_iter) #define SPTAB(t, i) (*(long *)((unsigned char *)t + i*(sizeof(long)/4))) #if defined(gould) /* use this if B.b[i] is evaluated just once ... 
*/ -#define DOXOR(x,y,i) x^=SPTAB(SPE[0][i],B.b[i]); y^=SPTAB(SPE[1][i],B.b[i]); +#define DOXOR(x,y,i) x^=SPTAB(&SPE[i * 64],B.b[i]); y^=SPTAB(&SPE[(8 * 64) + (i * 64)],B.b[i]); #else #if defined(pdp11) /* use this if your "long" int indexing is slow */ -#define DOXOR(x,y,i) j=B.b[i]; x^=SPTAB(SPE[0][i],j); y^=SPTAB(SPE[1][i],j); +#define DOXOR(x,y,i) j=B.b[i]; x^=SPTAB(&SPE[i * 64],j); y^=SPTAB(&SPE[(8 * 64) + (i * 64)],j); #else /* use this if "k" is allocated to a register ... */ -#define DOXOR(x,y,i) k=B.b[i]; x^=SPTAB(SPE[0][i],k); y^=SPTAB(SPE[1][i],k); +#define DOXOR(x,y,i) k=B.b[i]; x^=SPTAB(&SPE[i * 64],k); y^=SPTAB(&SPE[(8 * 64) + (i * 64)],k); #endif #endif @@ -731,7 +737,7 @@ STATIC int des_cipher(in, out, salt, num_iter) L0 = ((L0 >> 3) & 0x0f0f0f0fL) | ((L1 << 1) & 0xf0f0f0f0L); L1 = ((R0 >> 3) & 0x0f0f0f0fL) | ((R1 << 1) & 0xf0f0f0f0L); STORE(L,L0,L1,B); - PERM6464(L,L0,L1,B.b, (C_block *)CF6464); + PERM6464(L,L0,L1,B.b,CF6464); #if defined(MUST_ALIGN) STORE(L,L0,L1,B); out[0] = B.b[0]; out[1] = B.b[1]; out[2] = B.b[2]; out[3] = B.b[3]; @@ -781,6 +787,9 @@ STATIC void init_des() #ifdef DEBUG prtab("pc1tab", perm, 8); #endif + PC1ROT = (C_block *)calloc(sizeof(C_block), (64/CHUNKBITS) * (1<= 0; ) k = (k<<1) | tmp32[perm[i]-1]; - TO_SIX_BIT(SPE[0][tableno][j], k); + TO_SIX_BIT(SPE[(tableno * 64) + j], k); k = 0; for (i = 24; --i >= 0; ) k = (k<<1) | tmp32[perm[i+24]-1]; - TO_SIX_BIT(SPE[1][tableno][j], k); + TO_SIX_BIT(SPE[(8 * 64) + (tableno * 64) + j], k); } } } @@ -891,7 +903,7 @@ STATIC void init_des() * "perm" must be all-zeroes on entry to this routine. */ STATIC void init_perm(perm, p, chars_in, chars_out) - C_block perm[64/CHUNKBITS][1<>3] |= 1<<(k&07); + perm[(i * (1<>3] |= 1<<(k&07); } } } diff --git a/gen.subproj/popen.c b/gen.subproj/popen.c index 7729280..885d6c6 100644 --- a/gen.subproj/popen.c +++ b/gen.subproj/popen.c @@ -2,13 +2,13 @@ * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ - * + * * The contents of this file constitute Original Code as defined in and * are subject to the Apple Public Source License Version 1.1 (the * "License"). You may not use this file except in compliance with the * License. Please obtain a copy of the License at * http://www.apple.com/publicsource and read it before using this file. - * + * * This Original Code and all software distributed under the License are * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -16,7 +16,7 @@ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the * License for the specific language governing rights and limitations * under the License. - * + * * @APPLE_LICENSE_HEADER_END@ */ /* @@ -55,7 +55,6 @@ * SUCH DAMAGE. 
*/ - #include #include #include @@ -67,6 +66,9 @@ #include #include #include +#include + +#define environ *(_NSGetEnviron()) static struct pid { struct pid *next; @@ -81,38 +83,57 @@ popen(command, type) struct pid *cur; FILE *iop; int pdes[2], pid, twoway; + char *argv[4]; + struct pid *p; if (strchr(type, '+')) { twoway = 1; type = "r+"; - if (socketpair(AF_UNIX, SOCK_STREAM, 0, pdes) < 0) - return (NULL); + if (socketpair(AF_UNIX, SOCK_STREAM, 0, pdes) < 0) + return (NULL); } else { twoway = 0; - if (*type != 'r' && *type != 'w' || type[1] || - (pipe(pdes) < 0)) + if ((*type != 'r' && *type != 'w') || type[1]) return (NULL); } + if (pipe(pdes) < 0) + return (NULL); - if ((cur = malloc(sizeof(struct pid))) == NULL) + if ((cur = malloc(sizeof(struct pid))) == NULL) { + (void)close(pdes[0]); + (void)close(pdes[1]); return (NULL); + } + + argv[0] = "sh"; + argv[1] = "-c"; + argv[2] = (char *)command; + argv[3] = NULL; switch (pid = vfork()) { case -1: /* Error. */ (void)close(pdes[0]); (void)close(pdes[1]); - (void)free(cur); + free(cur); return (NULL); /* NOTREACHED */ case 0: /* Child. */ if (*type == 'r') { + /* + * The _dup2() to STDIN_FILENO is repeated to avoid + * writing to pdes[1], which might corrupt the + * parent's copy. This isn't good enough in + * general, since the _exit() is no return, so + * the compiler is free to corrupt all the local + * variables. + */ + (void)close(pdes[0]); if (pdes[1] != STDOUT_FILENO) { (void)dup2(pdes[1], STDOUT_FILENO); (void)close(pdes[1]); - pdes[1] = STDOUT_FILENO; - } - (void) close(pdes[0]); - if (twoway && (pdes[1] != STDIN_FILENO)) + if (twoway) + (void)dup2(STDOUT_FILENO, STDIN_FILENO); + } else if (twoway && (pdes[1] != STDIN_FILENO)) (void)dup2(pdes[1], STDIN_FILENO); } else { if (pdes[0] != STDIN_FILENO) { @@ -120,8 +141,11 @@ popen(command, type) (void)close(pdes[0]); } (void)close(pdes[1]); + } + for (p = pidlist; p; p = p->next) { + (void)close(fileno(p->fp)); } - execl(_PATH_BSHELL, "sh", "-c", command, NULL); + execve(_PATH_BSHELL, argv, environ); _exit(127); /* NOTREACHED */ } @@ -154,7 +178,6 @@ pclose(iop) FILE *iop; { register struct pid *cur, *last; - int omask; int pstat; pid_t pid; @@ -168,7 +191,7 @@ pclose(iop) (void)fclose(iop); do { - pid = waitpid(cur->pid, &pstat, 0); + pid = wait4(cur->pid, &pstat, 0, (struct rusage *)0); } while (pid == -1 && errno == EINTR); /* Remove the entry from the linked list. 
*/ diff --git a/gen.subproj/ppc.subproj/Makefile b/gen.subproj/ppc.subproj/Makefile index 2a0ec70..0c95a7b 100644 --- a/gen.subproj/ppc.subproj/Makefile +++ b/gen.subproj/ppc.subproj/Makefile @@ -14,16 +14,16 @@ PROJECT_TYPE = Component HFILES = fp.h genassym.h -OTHERLINKED = abs.s bcopy.s bzero.s ffs.s mcount.s memcpy.s\ - memmove.s strlen.s +OTHERLINKED = abs.s blockmoof.s bzero.s ffs.s mcount.s \ + strlen.s CFILES = bcmp.c ecvt.c insque.c isinf.c remque.c setjmperr.c\ strcat.c strcpy.c strncat.c strncmp.c strncpy.c OTHERSRCS = Makefile.preamble Makefile Makefile.postamble -OTHERLINKEDOFILES = abs.o bcopy.o bzero.o ffs.o mcount.o memcpy.o\ - memmove.o strlen.o +OTHERLINKEDOFILES = abs.o blockmoof.o bzero.o ffs.o mcount.o \ + strlen.o MAKEFILEDIR = $(MAKEFILEPATH)/pb_makefiles CODE_GEN_STYLE = DYNAMIC diff --git a/gen.subproj/ppc.subproj/PB.project b/gen.subproj/ppc.subproj/PB.project index d1d8013..6fec101 100644 --- a/gen.subproj/ppc.subproj/PB.project +++ b/gen.subproj/ppc.subproj/PB.project @@ -5,15 +5,13 @@ OTHER_LINKED = ( abs.s, bcmp.c, - bcopy.s, + blockmoof.s, bzero.s, ecvt.c, ffs.s, insque.c, isinf.c, mcount.s, - memcpy.s, - memmove.s, remque.c, setjmperr.c, strcat.c, diff --git a/gen.subproj/ppc.subproj/bcopy.s b/gen.subproj/ppc.subproj/bcopy.s deleted file mode 100644 index 38ffd42..0000000 --- a/gen.subproj/ppc.subproj/bcopy.s +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -; -; Copy bytes of data around. handles overlapped data. -; -; Change this to use Altivec later on -; - -; -; void bcopy(from, to, nbytes) -; - -; Use CR5_lt to indicate non-cached -#define noncache 20 -.text -.align 2 -#if !defined(MEMCPY) && !defined(MEMMOVE) -.globl _bcopy -_bcopy: - crclr noncache ; Set cached - cmplw cr1,r4,r3 ; Compare "to" and "from" - mr. r5,r5 ; Check if we have a 0 length - mr r6,r3 ; Set source - beqlr- cr1 ; Bail if "to" and "from" are the same - beqlr- ; Bail if length is 0 - b Lcopyit ; Go copy it... - -; -; When we move the memory, forward overlays must be handled. We -; also can not use the cache instructions if we are from bcopy_nc. -; We need to preserve R3 because it needs to be returned for memcpy. -; We can be interrupted and lose control here. -; -; There is no stack, so in order to used floating point, we would -; need to take the FP exception. Any potential gains by using FP -; would be more than eaten up by this. -; -; Later, we should used Altivec for large moves. -; - -#else -#if defined(MEMCPY) -.globl _memcpy -_memcpy: -#endif - -#if defined(MEMMOVE) -.globl _memmove -_memmove: -#endif - cmplw cr1,r3,r4 ; "to" and "from" the same? 
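Aside: both the deleted bcopy.s here and the blockmoof.s that replaces it funnel bcopy, memcpy and memmove into one shared copy body; the entry points only differ in how they map the C argument order, since bcopy takes (source, destination, length) while memcpy/memmove take (destination, source, length) and must return the destination. A minimal C sketch of that arrangement, with a hypothetical helper named copy_engine standing in for the shared assembly body (names are illustrative, not from the patch):

#include <stddef.h>

/* Hypothetical stand-in for the shared assembly body: copies len bytes
 * and tolerates overlapping buffers, like the code in this file. */
static void copy_engine(char *dst, const char *src, size_t len)
{
    if (dst == src || len == 0)
        return;
    if (dst < src || dst >= src + len) {        /* no overlap with the tail: ascending copy */
        while (len--)
            *dst++ = *src++;
    } else {                                    /* destination overlaps source: descending copy */
        dst += len;
        src += len;
        while (len--)
            *--dst = *--src;
    }
}

/* bcopy-style entry: (source, destination, length), arguments swapped. */
void xbcopy(const void *src, void *dst, size_t len)
{
    copy_engine(dst, src, len);
}

/* memcpy/memmove-style entry: (destination, source, length), returns dst. */
void *xmemmove(void *dst, const void *src, size_t len)
{
    copy_engine(dst, src, len);
    return dst;
}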
- mr r6,r4 ; Set the "from" - mr. r5,r5 ; Length zero? - crclr noncache ; Set cached - mr r4,r3 ; Set the "to" - beqlr- cr1 ; "to" and "from" are the same - beqlr- ; Length is 0 -#endif -Lcopyit: sub r12,r4,r6 ; Get potential overlap (negative if backward move) - lis r8,0x7FFF ; Start up a mask - srawi r11,r12,31 ; Propagate the sign bit - dcbt 0,r6 ; Touch in the first source line - cntlzw r7,r5 ; Get the highest power of 2 factor of the length - ori r8,r8,0xFFFF ; Make limit 0x7FFFFFFF - xor r9,r12,r11 ; If sink - source was negative, invert bits - srw r8,r8,r7 ; Get move length limitation - sub r9,r9,r11 ; If sink - source was negative, add 1 and get absolute value - cmplw r12,r5 ; See if we actually forward overlap - cmplwi cr7,r9,32 ; See if at least a line between source and sink - dcbtst 0,r4 ; Touch in the first sink line - cmplwi cr1,r5,32 ; Are we moving more than a line? - cror noncache,noncache,28 ; Set to not DCBZ output line if not enough space - blt- Lfwdovrlap ; This is a forward overlapping area, handle it... - -; -; R4 = sink -; R5 = length -; R6 = source -; - -; -; Here we figure out how much we have to move to get the sink onto a -; cache boundary. If we can, and there are still more that 32 bytes -; left to move, we can really speed things up by DCBZing the sink line. -; We can not do this if noncache is set because we will take an -; alignment exception. - - neg r0,r4 ; Get the number of bytes to move to align to a line boundary - rlwinm. r0,r0,0,27,31 ; Clean it up and test it - and r0,r0,r8 ; limit to the maximum front end move - mtcrf 3,r0 ; Make branch mask for partial moves - sub r5,r5,r0 ; Set the length left to move - beq Lalline ; Already on a line... - - bf 31,Lalhalf ; No single byte to do... - lbz r7,0(r6) ; Get the byte - addi r6,r6,1 ; Point to the next - stb r7,0(r4) ; Save the single - addi r4,r4,1 ; Bump sink - -; Sink is halfword aligned here - -Lalhalf: bf 30,Lalword ; No halfword to do... - lhz r7,0(r6) ; Get the halfword - addi r6,r6,2 ; Point to the next - sth r7,0(r4) ; Save the halfword - addi r4,r4,2 ; Bump sink - -; Sink is word aligned here - -Lalword: bf 29,Laldouble ; No word to do... - lwz r7,0(r6) ; Get the word - addi r6,r6,4 ; Point to the next - stw r7,0(r4) ; Save the word - addi r4,r4,4 ; Bump sink - -; Sink is double aligned here - -Laldouble: bf 28,Lalquad ; No double to do... - lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - addi r6,r6,8 ; Point to the next - stw r7,0(r4) ; Save the first word - stw r8,4(r4) ; Save the second word - addi r4,r4,8 ; Bump sink - -; Sink is quadword aligned here - -Lalquad: bf 27,Lalline ; No quad to do... - lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - lwz r9,8(r6) ; Get the third word - stw r7,0(r4) ; Save the first word - lwz r11,12(r6) ; Get the fourth word - addi r6,r6,16 ; Point to the next - stw r8,4(r4) ; Save the second word - stw r9,8(r4) ; Save the third word - stw r11,12(r4) ; Save the fourth word - addi r4,r4,16 ; Bump sink - -; Sink is line aligned here - -Lalline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move - mtcrf 3,r5 ; Make branch mask for backend partial moves - rlwinm r11,r5,0,0,26 ; Get number of bytes to move - beq- Lbackend ; No full lines to move - - sub r5,r5,r11 ; Calculate the residual - li r10,96 ; Stride for touch ahead - -Lnxtline: subic. r0,r0,1 ; Account for the line now - - bt- noncache,Lskipz ; Skip if we are not cached... 
- dcbz 0,r4 ; Blow away the whole line because we are replacing it - dcbt r6,r10 ; Touch ahead a bit - -Lskipz: lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - lwz r9,8(r6) ; Get the third word - stw r7,0(r4) ; Save the first word - lwz r11,12(r6) ; Get the fourth word - stw r8,4(r4) ; Save the second word - lwz r7,16(r6) ; Get the fifth word - stw r9,8(r4) ; Save the third word - lwz r8,20(r6) ; Get the sixth word - stw r11,12(r4) ; Save the fourth word - lwz r9,24(r6) ; Get the seventh word - stw r7,16(r4) ; Save the fifth word - lwz r11,28(r6) ; Get the eighth word - addi r6,r6,32 ; Point to the next - stw r8,20(r4) ; Save the sixth word - stw r9,24(r4) ; Save the seventh word - stw r11,28(r4) ; Save the eighth word - addi r4,r4,32 ; Bump sink - bgt+ Lnxtline ; Do the next line, if any... - - -; Move backend quadword - -Lbackend: bf 27,Lnoquad ; No quad to do... - lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - lwz r9,8(r6) ; Get the third word - lwz r11,12(r6) ; Get the fourth word - stw r7,0(r4) ; Save the first word - addi r6,r6,16 ; Point to the next - stw r8,4(r4) ; Save the second word - stw r9,8(r4) ; Save the third word - stw r11,12(r4) ; Save the fourth word - addi r4,r4,16 ; Bump sink - -; Move backend double - -Lnoquad: bf 28,Lnodouble ; No double to do... - lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - addi r6,r6,8 ; Point to the next - stw r7,0(r4) ; Save the first word - stw r8,4(r4) ; Save the second word - addi r4,r4,8 ; Bump sink - -; Move backend word - -Lnodouble: bf 29,Lnoword ; No word to do... - lwz r7,0(r6) ; Get the word - addi r6,r6,4 ; Point to the next - stw r7,0(r4) ; Save the word - addi r4,r4,4 ; Bump sink - -; Move backend halfword - -Lnoword: bf 30,Lnohalf ; No halfword to do... - lhz r7,0(r6) ; Get the halfword - addi r6,r6,2 ; Point to the next - sth r7,0(r4) ; Save the halfword - addi r4,r4,2 ; Bump sink - -; Move backend byte - -Lnohalf: bflr 31 ; Leave cuz we are all done... - lbz r7,0(r6) ; Get the byte - stb r7,0(r4) ; Save the single - - blr ; Leave cuz we are all done... - -; -; 0123456789ABCDEF0123456789ABCDEF -; 0123456789ABCDEF0123456789ABCDEF -; F -; DE -; 9ABC -; 12345678 -; 123456789ABCDEF0 -; 0 - -; -; Here is where we handle a forward overlapping move. These will be slow -; because we can not kill the cache of the destination until after we have -; loaded/saved the source area. Also, because reading memory backwards is -; slower when the cache line needs to be loaded because the critical -; doubleword is loaded first, i.e., the last, then it goes back to the first, -; and on in order. That means that when we are at the second to last DW we -; have to wait until the whole line is in cache before we can proceed. -; - -Lfwdovrlap: add r4,r5,r4 ; Point past the last sink byte - add r6,r5,r6 ; Point past the last source byte - and r0,r4,r8 ; Apply movement limit - li r12,-1 ; Make sure we touch in the actual line - mtcrf 3,r0 ; Figure out the best way to move backwards - dcbt r12,r6 ; Touch in the last line of source - rlwinm. r0,r0,0,27,31 ; Calculate the length to adjust to cache boundary - dcbtst r12,r4 ; Touch in the last line of the sink - beq- Lballine ; Aready on cache line boundary - - sub r5,r5,r0 ; Precaculate move length left after alignment - - bf 31,Lbalhalf ; No single byte to do... 
- lbz r7,-1(r6) ; Get the byte - subi r6,r6,1 ; Point to the next - stb r7,-1(r4) ; Save the single - subi r4,r4,1 ; Bump sink - -; Sink is halfword aligned here - -Lbalhalf: bf 30,Lbalword ; No halfword to do... - lhz r7,-2(r6) ; Get the halfword - subi r6,r6,2 ; Point to the next - sth r7,-2(r4) ; Save the halfword - subi r4,r4,2 ; Bump sink - -; Sink is word aligned here - -Lbalword: bf 29,Lbaldouble ; No word to do... - lwz r7,-4(r6) ; Get the word - subi r6,r6,4 ; Point to the next - stw r7,-4(r4) ; Save the word - subi r4,r4,4 ; Bump sink - -; Sink is double aligned here - -Lbaldouble: bf 28,Lbalquad ; No double to do... - lwz r7,-8(r6) ; Get the first word - lwz r8,-4(r6) ; Get the second word - subi r6,r6,8 ; Point to the next - stw r7,-8(r4) ; Save the first word - stw r8,-4(r4) ; Save the second word - subi r4,r4,8 ; Bump sink - -; Sink is quadword aligned here - -Lbalquad: bf 27,Lballine ; No quad to do... - lwz r7,-16(r6) ; Get the first word - lwz r8,-12(r6) ; Get the second word - lwz r9,-8(r6) ; Get the third word - lwz r11,-4(r6) ; Get the fourth word - stw r7,-16(r4) ; Save the first word - subi r6,r6,16 ; Point to the next - stw r8,-12(r4) ; Save the second word - stw r9,-8(r4) ; Save the third word - stw r11,-4(r4) ; Save the fourth word - subi r4,r4,16 ; Bump sink - -; Sink is line aligned here - -Lballine: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move - mtcrf 3,r5 ; Make branch mask for backend partial moves - beq- Lbbackend ; No full lines to move - - -; Registers in use: R0, R1, R3, R4, R5, R6 -; Registers not in use: R2, R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them - -Lbnxtline: subic. r0,r0,1 ; Account for the line now - - lwz r7,-32(r6) ; Get the first word - lwz r5,-28(r6) ; Get the second word - lwz r2,-24(r6) ; Get the third word - lwz r12,-20(r6) ; Get the third word - lwz r11,-16(r6) ; Get the fifth word - lwz r10,-12(r6) ; Get the sixth word - lwz r9,-8(r6) ; Get the seventh word - lwz r8,-4(r6) ; Get the eighth word - subi r6,r6,32 ; Point to the next - - stw r7,-32(r4) ; Get the first word - ble- Lbnotouch ; Last time, skip touch of source... - dcbt 0,r6 ; Touch in next source line - -Lbnotouch: stw r5,-28(r4) ; Get the second word - stw r2,-24(r4) ; Get the third word - stw r12,-20(r4) ; Get the third word - stw r11,-16(r4) ; Get the fifth word - stw r10,-12(r4) ; Get the sixth word - stw r9,-8(r4) ; Get the seventh word - stw r8,-4(r4) ; Get the eighth word - subi r4,r4,32 ; Bump sink - - bgt+ Lbnxtline ; Do the next line, if any... - -; -; Note: We touched these lines in at the beginning -; - -; Move backend quadword - -Lbbackend: bf 27,Lbnoquad ; No quad to do... - lwz r7,-16(r6) ; Get the first word - lwz r8,-12(r6) ; Get the second word - lwz r9,-8(r6) ; Get the third word - lwz r11,-4(r6) ; Get the fourth word - stw r7,-16(r4) ; Save the first word - subi r6,r6,16 ; Point to the next - stw r8,-12(r4) ; Save the second word - stw r9,-8(r4) ; Save the third word - stw r11,-4(r4) ; Save the fourth word - subi r4,r4,16 ; Bump sink - -; Move backend double - -Lbnoquad: bf 28,Lbnodouble ; No double to do... - lwz r7,-8(r6) ; Get the first word - lwz r8,-4(r6) ; Get the second word - subi r6,r6,8 ; Point to the next - stw r7,-8(r4) ; Save the first word - stw r8,-4(r4) ; Save the second word - subi r4,r4,8 ; Bump sink - -; Move backend word - -Lbnodouble: bf 29,Lbnoword ; No word to do... 
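Aside: both directions of this move follow the same head / body / tail shape: a few small moves until the destination sits on a 32-byte cache-line boundary, a loop over whole lines (where dcbz can clear the destination line and dcbt can touch ahead), then progressively smaller backend moves for the leftovers. A rough C sketch of that shape for the ascending direction only, assuming 32-byte lines and omitting the cache hints, which have no portable C equivalent:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define LINE 32   /* cache-line size assumed by the assembly */

static void copy_by_lines(unsigned char *dst, const unsigned char *src, size_t len)
{
    /* Head: single bytes until dst reaches a 32-byte boundary. */
    size_t head = (size_t)(-(uintptr_t)dst & (LINE - 1));
    if (head > len)
        head = len;
    len -= head;
    while (head--)
        *dst++ = *src++;

    /* Body: whole 32-byte lines (the asm would dcbz/dcbt here). */
    while (len >= LINE) {
        memcpy(dst, src, LINE);    /* stands in for the eight lwz/stw pairs */
        dst += LINE;
        src += LINE;
        len -= LINE;
    }

    /* Tail: whatever is left, smaller than one line. */
    while (len--)
        *dst++ = *src++;
}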
- lwz r7,-4(r6) ; Get the word - subi r6,r6,4 ; Point to the next - stw r7,-4(r4) ; Save the word - subi r4,r4,4 ; Bump sink - -; Move backend halfword - -Lbnoword: bf 30,Lbnohalf ; No halfword to do... - lhz r7,-2(r6) ; Get the halfword - subi r6,r6,2 ; Point to the next - sth r7,-2(r4) ; Save the halfword - subi r4,r4,2 ; Bump sink - -; Move backend byte - -Lbnohalf: bflr 31 ; Leave cuz we are all done... - lbz r7,-1(r6) ; Get the byte - stb r7,-1(r4) ; Save the single - - blr ; Leave cuz we are all done... diff --git a/gen.subproj/ppc.subproj/blockmoof.s b/gen.subproj/ppc.subproj/blockmoof.s new file mode 100755 index 0000000..947e7f0 --- /dev/null +++ b/gen.subproj/ppc.subproj/blockmoof.s @@ -0,0 +1,940 @@ +/* + * Copyright (c) 1992-2001 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include + +// ================================================================================================= +// *** The easiest way to assemble things on Mac OS X is via "cc", so this uses #defines and such. +// ================================================================================================= + +// Keep track of whether we have Altivec +// This gets set in pthread_init() + +.data +.align 2 +.globl __cpu_has_altivec +__cpu_has_altivec: +.long 0 + +.text +.align 2 +.globl _bcopy +.globl _memcpy +.globl _memmove + +_bcopy: + mr r2,r4 // Since bcopy uses (src,dest,count), swap r3,r4 + mr r4,r3 + mr r3,r2 +_memcpy: +_memmove: + mr r2,r3 // Store dest ptr in r2 to preserve r3 on return + +// ------------------ +// Standard registers + +#define rs r4 +#define rd r2 +#define rc r5 + +// Should we bother using Altivec? + + cmpwi r5, 128 + blt+ LScalar + +// Determine whether we have Altivec enabled + + mflr r0 + bcl 20,31,1f +1: + mflr r6 + mtlr r0 + addis r6, r6, ha16(__cpu_has_altivec - 1b) + lwz r6, lo16(__cpu_has_altivec - 1b)(r6) + cmpwi r6, 0 + bne+ LAltivec + +// ================================================================================================= + +// ***************************************** +// * S c a l a r B l o c k M o o f D a t a * +// ***************************************** +// +// This is the scalar (non-AltiVec) version of BlockMoofData. +// +// void ScalarBlockMoofData (ptr sou, ptr dest, long len) +// void ScalarBlockMoofDataUncached (ptr sou, ptr dest, long len) +// +// +// Calling Sequence: r3 = source pointer +// r4 = destination pointer +// r5 = length in bytes +// +// Uses: all volatile registers. + +LScalar: + cmplwi cr7,rc,32 // length <= 32 bytes? + cmplw cr6,rd,rs // up or down? + mr. 
r0,rc // copy to r0 for MoveShort, and test for negative + bgt cr7,Lbm1 // skip if count > 32 + +// Handle short moves (<=32 bytes.) + + beq cr7,LMove32 // special case 32-byte blocks + blt cr6,LMoveDownShort // move down in memory and return + add rs,rs,rc // moving up (right-to-left), so adjust pointers + add rd,rd,rc + b LMoveUpShort // move up in memory and return + +// Handle long moves (>32 bytes.) + +Lbm1: + beqlr cr6 // rs==rd, so nothing to move + bltlr cr0 // length<0, so ignore call and return + mflr r12 // save return address + bge cr6,Lbm2 // rd>=rs, so move up + +// Long moves down (left-to-right.) + + neg r6,rd // start to 32-byte-align destination + andi. r0,r6,0x1F // r0 <- bytes to move to align destination + bnel LMoveDownShort // align destination if necessary + bl LMoveDownLong // move 32-byte chunks down + andi. r0,rc,0x1F // done? + mtlr r12 // restore caller's return address + bne LMoveDownShort // move trailing leftover bytes and done + blr // no leftovers, so done + +// Long moves up (right-to-left.) + +Lbm2: + add rs,rs,rc // moving up (right-to-left), so adjust pointers + add rd,rd,rc + andi. r0,rd,0x1F // r0 <- bytes to move to align destination + bnel LMoveUpShort // align destination if necessary + bl LMoveUpLong // move 32-byte chunks up + andi. r0,rc,0x1F // done? + mtlr r12 // restore caller's return address + bne LMoveUpShort // move trailing leftover bytes and done + blr // no leftovers, so done + +// *************** +// * M O V E 3 2 * +// *************** +// +// Special case subroutine to move a 32-byte block. MoveDownShort and +// MoveUpShort only handle 0..31 bytes, and we believe 32 bytes is too +// common a case to send it through the general purpose long-block code. +// Since it moves both up and down, we must load all 32 bytes before +// storing any. +// +// Calling Sequence: rs = source ptr +// rd = destination ptr +// +// Uses: r0,r5-r11. +// + +LMove32: + lwz r0,0(rs) + lwz r5,4(rs) + lwz r6,8(rs) + lwz r7,12(rs) + lwz r8,16(rs) + lwz r9,20(rs) + lwz r10,24(rs) + lwz r11,28(rs) + stw r0,0(rd) + stw r5,4(rd) + stw r6,8(rd) + stw r7,12(rd) + stw r8,16(rd) + stw r9,20(rd) + stw r10,24(rd) + stw r11,28(rd) + blr + + +// ************************* +// * M o v e U p S h o r t * +// ************************* +// +// Subroutine called to move <32 bytes up in memory (ie, right-to-left). +// +// Entry conditions: rs = last byte moved from source (right-to-left) +// rd = last byte moved into destination +// r0 = #bytes to move (0..31) +// +// Exit conditions: rs = updated source ptr +// rd = updated destination ptr +// rc = decremented by #bytes moved +// +// Uses: r0,r6,r7,r8,cr7. +// + +LMoveUpShort: + andi. 
r6,r0,0x10 // test 0x10 bit in length + mtcrf 0x1,r0 // move count to cr7 so we can test bits + sub rc,rc,r0 // decrement count of bytes remaining to be moved + beq Lmus1 // skip if 0x10 bit in length is 0 + lwzu r0,-16(rs) // set, so copy up 16 bytes + lwz r6,4(rs) + lwz r7,8(rs) + lwz r8,12(rs) + stwu r0,-16(rd) + stw r6,4(rd) + stw r7,8(rd) + stw r8,12(rd) + +Lmus1: + bf 28,Lmus2 // test 0x08 bit + lwzu r0,-8(rs) + lwz r6,4(rs) + stwu r0,-8(rd) + stw r6,4(rd) + +Lmus2: + bf 29,Lmus3 // test 0x4 bit + lwzu r0,-4(rs) + stwu r0,-4(rd) + +Lmus3: + bf 30,Lmus4 // test 0x2 bit + lhzu r0,-2(rs) + sthu r0,-2(rd) + +Lmus4: + bflr 31 // test 0x1 bit, return if 0 + lbzu r0,-1(rs) + stbu r0,-1(rd) + blr + + +// ***************************** +// * M o v e D o w n S h o r t * +// ***************************** +// +// Subroutine called to move <32 bytes down in memory (ie, left-to-right). +// +// Entry conditions: rs = source pointer +// rd = destination pointer +// r0 = #bytes to move (0..31) +// +// Exit conditions: rs = ptr to 1st byte not moved +// rd = ptr to 1st byte not moved +// rc = decremented by #bytes moved +// +// Uses: r0,r6,r7,r8,cr7. +// + +LMoveDownShort: + andi. r6,r0,0x10 // test 0x10 bit in length + mtcrf 0x1,r0 // move count to cr7 so we can test bits + sub rc,rc,r0 // decrement count of bytes remaining to be moved + beq Lmds1 // skip if 0x10 bit in length is 0 + lwz r0,0(rs) // set, so copy up 16 bytes + lwz r6,4(rs) + lwz r7,8(rs) + lwz r8,12(rs) + addi rs,rs,16 + stw r0,0(rd) + stw r6,4(rd) + stw r7,8(rd) + stw r8,12(rd) + addi rd,rd,16 + +Lmds1: + bf 28,Lmds2 // test 0x08 bit + lwz r0,0(rs) + lwz r6,4(rs) + addi rs,rs,8 + stw r0,0(rd) + stw r6,4(rd) + addi rd,rd,8 + +Lmds2: + bf 29,Lmds3 // test 0x4 bit + lwz r0,0(rs) + addi rs,rs,4 + stw r0,0(rd) + addi rd,rd,4 + +Lmds3: + bf 30,Lmds4 // test 0x2 bit + lhz r0,0(rs) + addi rs,rs,2 + sth r0,0(rd) + addi rd,rd,2 + +Lmds4: + bflr 31 // test 0x1 bit, return if 0 + lbz r0,0(rs) + addi rs,rs,1 + stb r0,0(rd) + addi rd,rd,1 + blr + + +// *********************** +// * M o v e U p L o n g * +// *********************** +// +// Subroutine to move 32-byte chunks of memory up (ie, right-to-left.) +// The destination is known to be 32-byte aligned, but the source is +// *not* necessarily aligned. +// +// Entry conditions: rs = last byte moved from source (right-to-left) +// rd = last byte moved into destination +// rc = count of bytes to move +// cr = crCached set iff destination is cacheable +// +// Exit conditions: rs = updated source ptr +// rd = updated destination ptr +// rc = low order 8 bits of count of bytes to move +// +// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7. +// + +LMoveUpLong: + srwi. r11,rc,5 // r11 <- #32 byte chunks to move + mtctr r11 // prepare loop count + beqlr // return if no chunks to move + andi. r0,rs,7 // is source at least doubleword aligned? + beq Lmup3 // yes, can optimize this case + mtcrf 0x1,rc // save low bits of count + mtcrf 0x2,rc // (one cr at a time, as 604 prefers) + +Lmup1: // loop over each 32-byte-chunk + lwzu r0,-32(rs) + subi rd,rd,32 // prepare destination address for 'dcbz' + lwz r5,4(rs) + lwz r6,8(rs) + lwz r7,12(rs) + lwz r8,16(rs) + lwz r9,20(rs) + lwz r10,24(rs) + lwz r11,28(rs) + stw r0,0(rd) + stw r5,4(rd) + stw r6,8(rd) + stw r7,12(rd) + stw r8,16(rd) + stw r9,20(rd) + stw r10,24(rd) + stw r11,28(rd) + bdnz Lmup1 + mfcr rc // restore low bits of count + blr // return to caller + +// Aligned operands, so use d.p. floating point registers to move data. 
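Aside: when both operands are doubleword aligned, the long scalar loops in this file (Lmup3 just below, and Lmdown3 in the left-to-right routine) move data through 64-bit floating-point registers, so each 32-byte block costs four loads and four stores instead of eight of each. A rough C equivalent using 64-bit integer chunks, assuming both pointers really are 8-byte aligned and the length is a multiple of 32:

#include <stddef.h>
#include <stdint.h>

/* Copy len bytes, len a multiple of 32, both pointers 8-byte aligned. */
static void copy_doublewords(void *dst, const void *src, size_t len)
{
    uint64_t *d = (uint64_t *)dst;
    const uint64_t *s = (const uint64_t *)src;

    while (len >= 32) {            /* four 8-byte moves per 32-byte block */
        d[0] = s[0];
        d[1] = s[1];
        d[2] = s[2];
        d[3] = s[3];
        d += 4;
        s += 4;
        len -= 32;
    }
}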
+ +Lmup3: + lfdu f0,-32(rs) + subi rd,rd,32 // prepare destination address for 'dcbz' + lfd f1,8(rs) + lfd f2,16(rs) + lfd f3,24(rs) + stfd f0,0(rd) + stfd f1,8(rd) + stfd f2,16(rd) + stfd f3,24(rd) + bdnz Lmup3 + blr // return to caller + + +// *************************** +// * M o v e D o w n L o n g * +// *************************** +// +// Subroutine to move 32-byte chunks of memory down (ie, left-to-right.) +// The destination is known to be 32-byte aligned, but the source is +// *not* necessarily aligned. +// +// Entry conditions: rs = source ptr (next byte to move) +// rd = dest ptr (next byte to move into) +// rc = count of bytes to move +// cr = crCached set iff destination is cacheable +// +// Exit conditions: rs = updated source ptr +// rd = updated destination ptr +// rc = low order 8 bits of count of bytes to move +// +// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7. +// + +LMoveDownLong: + srwi. r11,rc,5 // r11 <- #32 byte chunks to move + mtctr r11 // prepare loop count + beqlr // return if no chunks to move + andi. r0,rs,7 // is source at least doubleword aligned? + beq Lmdown3 // yes, can optimize this case + mtcrf 0x1,rc // save low 8 bits of count + mtcrf 0x2,rc // (one cr at a time, as 604 prefers) + +Lmdown1: // loop over each 32-byte-chunk + lwz r0,0(rs) + lwz r5,4(rs) + lwz r6,8(rs) + lwz r7,12(rs) + lwz r8,16(rs) + lwz r9,20(rs) + lwz r10,24(rs) + lwz r11,28(rs) + stw r0,0(rd) + stw r5,4(rd) + stw r6,8(rd) + stw r7,12(rd) + stw r8,16(rd) + stw r9,20(rd) + addi rs,rs,32 + stw r10,24(rd) + stw r11,28(rd) + addi rd,rd,32 + bdnz Lmdown1 + mfcr rc // restore low bits of count + blr // return to caller + +// Aligned operands, so use d.p. floating point registers to move data. + +Lmdown3: + lfd f0,0(rs) + lfd f1,8(rs) + lfd f2,16(rs) + lfd f3,24(rs) + addi rs,rs,32 + stfd f0,0(rd) + stfd f1,8(rd) + stfd f2,16(rd) + stfd f3,24(rd) + addi rd,rd,32 + bdnz Lmdown3 + blr // return to caller + +// +// Register use conventions are as follows: +// +// r0 - temp +// r6 - copy of VMX SPR at entry +// r7 - temp +// r8 - constant -1 (also temp and a string op buffer) +// r9 - constant 16 or -17 (also temp and a string op buffer) +// r10- constant 32 or -33 (also temp and a string op buffer) +// r11- constant 48 or -49 (also temp and a string op buffer) +// r12- chunk count ("c") in long moves +// +// v0 - vp - permute vector +// v1 - va - 1st quadword of source +// v2 - vb - 2nd quadword of source +// v3 - vc - 3rd quadword of source +// v4 - vd - 4th quadword of source +// v5 - vx - temp +// v6 - vy - temp +// v7 - vz - temp + +#define vp v0 +#define va v1 +#define vb v2 +#define vc v3 +#define vd v4 +#define vx v5 +#define vy v6 +#define vz v7 + +#define VRSave 256 + +// kShort should be the crossover point where the long algorithm is faster than the short. +// WARNING: kShort must be >= 64 + +// Yes, I know, we just checked rc > 128 to get here... + +#define kShort 128 +LAltivec: + cmpwi cr1,rc,kShort //(1) too short to bother using vector regs? + sub. 
r0,rd,rs //(1) must move reverse if (rd-rs)<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// <><> S H O R T O P E R A N D S <><> +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> + +LAlignedLoop: // word aligned operands (the common case) + lfd f0,0(rs) //(1) + lfd f1,8(rs) //(2) + addi rs,rs,16 //(2) + stfd f0,0(rd) //(3) + stfd f1,8(rd) //(4) + addi rd,rd,16 //(4) + bdnz LAlignedLoop //(4) + +Leftovers: + beqlr- cr7 //(8) done if r7==0, ie no leftover bytes + mtxer r7 //(9) count of bytes to move (1-15) + lswx r8,0,rs + stswx r8,0,rd + blr //(17) + +LUnalignedLoop: // not word aligned, cannot use lfd/stfd + lwz r8,0(rs) //(1) + lwz r9,4(rs) //(2) + lwz r10,8(rs) //(3) + lwz r11,12(rs) //(4) + addi rs,rs,16 //(4) + stw r8,0(rd) //(5) + stw r9,4(rd) //(6) + stw r10,8(rd) //(7) + stw r11,12(rd) //(8) + addi rd,rd,16 //(8) + bdnz LUnalignedLoop //(8) + + b Leftovers + + +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// <><> S H O R T R E V E R S E M O V E S <><> +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> + + // cr0 & r9 <- #doublewords to move (>=0) + // cr1 <- beq if word aligned + // r7 <- #leftover bytes to move (0-15) + +LShortReverse: + cmpwi cr7,r7,0 // leftover bytes? + add rs,rs,rc // point 1 past end of string for reverse moves + add rd,rd,rc + beq- LeftoversReverse // r9==0, ie no words to move + mtctr r9 // set up for quadword loop + bne- cr1,LUnalignedLoopReverse + +LAlignedLoopReverse: // word aligned, so use lfd/stfd + lfd f0,-8(rs) + lfdu f1,-16(rs) + stfd f0,-8(rd) + stfdu f1,-16(rd) + bdnz LAlignedLoopReverse + +LeftoversReverse: + beqlr- cr7 // done if r7==0, ie no leftover bytes + mtxer r7 // count of bytes to move (1-15) + neg r7,r7 // index back by #bytes + lswx r8,r7,rs + stswx r8,r7,rd + blr + +LUnalignedLoopReverse: // not word aligned, cannot use lfd/stfd + lwz r8,-4(rs) + lwz r9,-8(rs) + lwz r10,-12(rs) + lwzu r11,-16(rs) + stw r8,-4(rd) + stw r9,-8(rd) + stw r10,-12(rd) + stwu r11,-16(rd) + bdnz LUnalignedLoopReverse + + b LeftoversReverse + +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// <><> L O N G O P E R A N D S <><> +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> + + // cr6 set (blt) if must move reverse + // r0 <- (rd - rs) + +LMoveLong: + mfspr r6,VRSave //(5) save caller's VMX mask register + stw r6,-4(r1) // use CR save area so we can use r6 later + neg r8,rd //(5) start to compute #bytes to fill in 1st dest quadword + rlwinm r0,r0,0,28,31 //(6) start to determine relative alignment + andi. r7,r8,0xF //(6) r7 <- #bytes to fill in 1st dest quadword + cmpwi cr7,r0,0 //(7) relatively aligned? (ie, 16 bytes apart?) + oris r9,r6,0xFF00 //(7) light bits for regs we use (v0-v7) + mtspr VRSave,r9 //(8) update live register bitmask + blt- cr6,LongReverse //(8) must move reverse direction + sub rc,rc,r7 //(9) adjust length while we wait + beq- LDest16Aligned //(9) r7==0, ie destination already quadword aligned + + // Align destination on a quadword. + + mtxer r7 //(10) set up byte count (1-15) + lswx r8,0,rs // load into r8-r11 + stswx r8,0,rd // store r8-r11 (measured latency on arthur is 7.2 cycles) + add rd,rd,r7 //(18) adjust ptrs + add rs,rs,r7 //(18) + + // Begin preparation for inner loop and "dst" stream. + +LDest16Aligned: + andi. r0,rd,0x10 //(19) is destination cache-block aligned? 
+ li r9,16 //(19) r9 <- constant used to access 2nd quadword + li r10,32 //(20) r10<- constant used to access 3rd quadword + beq- cr7,LAligned //(20) handle relatively aligned operands + lvx va,0,rs //(20) prefetch 1st source quadword + li r11,48 //(21) r11<- constant used to access 4th quadword + lvsl vp,0,rs //(21) get permute vector to left shift + beq LDest32Aligned //(22) destination already cache-block aligned + + // Copy 16 bytes to align destination on 32-byte (cache block) boundary + // to maximize store gathering. + + lvx vb,r9,rs //(23) get 2nd source qw + subi rc,rc,16 //(23) adjust count + addi rs,rs,16 //(24) adjust source ptr + vperm vx,va,vb,vp //(25) vx <- 1st destination qw + vor va,vb,vb //(25) va <- vb + stvx vx,0,rd //(26) assuming store Q deep enough to avoid latency + addi rd,rd,16 //(26) adjust dest ptr + + // Destination 32-byte aligned, source alignment unknown. + +LDest32Aligned: + srwi. r12,rc,6 //(27) r12<- count of 64-byte chunks to move + rlwinm r7,rc,28,30,31 //(27) r7 <- count of 16-byte chunks to move + cmpwi cr1,r7,0 //(28) remember if any 16-byte chunks + rlwinm r8,r12,0,26,31 //(29) mask chunk count down to 0-63 + subi r0,r8,1 //(30) r8==0? + beq- LNoChunks //(30) r12==0, ie no chunks to move + rlwimi r8,r0,0,25,25 //(31) if r8==0, then r8 <- 64 + li r0,64 //(31) r0 <- used to get 1st quadword of next chunk + sub. r12,r12,r8 //(32) adjust chunk count, set cr0 + mtctr r8 //(32) set up loop count + li r8,96 //SKP + li r6,128 //SKP + // Inner loop for unaligned sources. We copy 64 bytes per iteration. + // We loop at most 64 times, then reprime the "dst" and loop again for + // the next 4KB. This loop is tuned to keep the CPU flat out, which + // means we need to execute a lvx or stvx every cycle. + +LoopBy64: + dcbt rs,r8 //SKP + dcbt rs,r6 //SKP + lvx vb,r9,rs //(1) 2nd source quadword (1st already in va) + lvx vc,r10,rs //(2) 3rd + lvx vd,r11,rs //(3) 4th + vperm vx,va,vb,vp //(3) vx <- 1st destination quadword + lvx va,rs,r0 //(4) get 1st qw of next 64-byte chunk (r0 must be RB!) + vperm vy,vb,vc,vp //(4) vy <- 2nd dest qw + stvx vx,0,rd //(5) + vperm vz,vc,vd,vp //(5) vz <- 3rd dest qw + stvx vy,r9,rd //(6) + vperm vx,vd,va,vp //(6) vx <- 4th + stvx vz,r10,rd //(7) + addi rs,rs,64 //(7) + stvx vx,r11,rd //(8) + addi rd,rd,64 //(8) + bdnz LoopBy64 //(8) + + // End of inner loop. Should we reprime dst stream and restart loop? + // This block is only executed when we're moving more than 4KB. + // It is usually folded out because cr0 is set in the loop prologue. + + beq+ LNoChunks // r12==0, ie no more chunks to move + sub. r12,r12,r0 // set cr0 if more than 4KB remain to xfer + mtctr r0 // initialize loop count to 64 + b LoopBy64 // restart inner loop, xfer another 4KB + + // Fewer than 64 bytes remain to be moved. + +LNoChunks: // r7 and cr1 are set with the number of QWs + andi. rc,rc,0xF //(33) rc <- leftover bytes + beq- cr1,LCleanup //(33) r7==0, ie fewer than 16 bytes remaining + mtctr r7 //(34) we will loop over 1-3 QWs + +LoopBy16: + lvx vb,r9,rs //(1) vb <- 2nd source quadword + addi rs,rs,16 //(1) + vperm vx,va,vb,vp //(3) vx <- next destination quadword + vor va,vb,vb //(3) va <- vb + stvx vx,0,rd //(4) assuming store Q is deep enough to mask latency + addi rd,rd,16 //(4) + bdnz LoopBy16 //(4) + + // Move remaining bytes in last quadword. rc and cr0 have the count. 
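Aside: the unaligned inner loops above (LoopBy64 and LoopBy16) never issue a misaligned load. lvx always fetches aligned quadwords, and vperm, driven by the permute vector that lvsl built from the source address, splices two neighbouring quadwords into each output value, with "vor va,vb,vb" carrying the trailing quadword into the next iteration. The same splicing idea in plain C, scaled down to 32-bit words and assuming a big-endian layout like the PowerPC targets this file is written for; like the assembly, it may read a few bytes past the end of the source, but only within the last aligned word:

#include <stddef.h>
#include <stdint.h>

/*
 * Copy nwords 32-bit words from a possibly misaligned source to an aligned
 * destination, issuing only aligned 32-bit loads (big-endian byte order).
 */
static void copy_spliced_be(uint32_t *dst, const unsigned char *src, size_t nwords)
{
    size_t shift = (uintptr_t)src & 3;                      /* source misalignment  */
    const uint32_t *asrc = (const uint32_t *)(src - shift); /* aligned base pointer */

    if (shift == 0) {                                       /* nothing to splice    */
        while (nwords--)
            *dst++ = *asrc++;
        return;
    }

    uint32_t prev = *asrc++;                                /* priming load (lvx)   */
    while (nwords--) {
        uint32_t next = *asrc++;                            /* next aligned word    */
        /* Splice low bytes of prev with high bytes of next (the vperm step). */
        *dst++ = (prev << (8 * shift)) | (next >> (8 * (4 - shift)));
        prev = next;                                        /* carry (vor va,vb,vb) */
    }
}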
+ +LCleanup: + lwz r6,-4(r1) // load VRSave from CR save area + mtspr VRSave,r6 //(35) restore caller's live-register bitmask + beqlr //(36) rc==0, ie no leftovers, so done + mtxer rc //(37) load byte count (1-15) + lswx r8,0,rs + stswx r8,0,rd + blr //(45) + + +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// <><> L O N G A L I G N E D M O V E S <><> +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> + + // rs, rd <- both quadword aligned + // cr0 <- beq if dest is cache block (32-byte) aligned + // r9 <- 16 + // r10 <- 32 + +LAligned: + lvx va,0,rs // prefetch 1st source quadword + li r11,48 // r11<- constant used to access 4th quadword + beq LAligned32 // destination already cache-block aligned + + // Copy 16 bytes to align destination on 32-byte (cache block) boundary + // to maximize store gathering. + + subi rc,rc,16 // adjust count + addi rs,rs,16 // adjust source ptr + stvx va,0,rd // assuming store Q deep enough to avoid latency + addi rd,rd,16 // adjust dest ptr + + // Destination 32-byte aligned, source 16-byte aligned. Set up for inner loop. + +LAligned32: + srwi. r12,rc,6 // r12<- count of 64-byte chunks to move + rlwinm r7,rc,28,30,31 // r7 <- count of 16-byte chunks to move + cmpwi cr1,r7,0 // remember if any 16-byte chunks + rlwinm r8,r12,0,26,31 // mask chunk count down to 0-63 + subi r0,r8,1 // r8==0? + beq- LAlignedNoChunks // r12==0, ie no chunks to move + rlwimi r8,r0,0,25,25 // if r8==0, then r8 <- 64 + li r0,64 // r0 <- used at end of loop + sub. r12,r12,r8 // adjust chunk count, set cr0 + mtctr r8 // set up loop count + li r8,96 //SKP + li r6,128 //SKP + + // Inner loop for aligned sources. We copy 64 bytes per iteration. + +LAlignedLoopBy64: + dcbt rs,r8 //SKP + dcbt rs,r6 //SKP + lvx va,0,rs //(1) + lvx vb,r9,rs //(2) + lvx vc,r10,rs //(3) + lvx vd,r11,rs //(4) + addi rs,rs,64 //(4) + stvx va,0,rd //(5) + stvx vb,r9,rd //(6) + stvx vc,r10,rd //(7) + stvx vd,r11,rd //(8) + addi rd,rd,64 //(8) + bdnz LAlignedLoopBy64 //(8) + + // End of inner loop. Loop again for next 4KB iff any. + + beq+ LAlignedNoChunks // r12==0, ie no more chunks to move + sub. r12,r12,r0 // set cr0 if more than 4KB remain to xfer + mtctr r0 // reinitialize loop count to 64 + b LAlignedLoopBy64 // restart inner loop, xfer another 4KB + + // Fewer than 64 bytes remain to be moved. + +LAlignedNoChunks: // r7 and cr1 are set with the number of QWs + andi. rc,rc,0xF // rc <- leftover bytes + beq- cr1,LCleanup // r7==0, ie fewer than 16 bytes remaining + mtctr r7 // we will loop over 1-3 QWs + +LAlignedLoopBy16: + lvx va,0,rs // get next quadword + addi rs,rs,16 + stvx va,0,rd + addi rd,rd,16 + bdnz LAlignedLoopBy16 + + b LCleanup // handle last 0-15 bytes, if any + + +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// <><> L O N G R E V E R S E M O V E S <><> +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> + + // Reverse moves. These involve overlapping operands, with the source + // lower in memory (lower addresses) than the destination. They must be + // done right-to-left, ie from high addresses down to low addresses. + // Throughout this code, we maintain rs and rd as pointers one byte past + // the end of the untransferred operands. 
+ // + // The byte count is >=kShort and the following registers are already loaded: + // + // r6 - VMX mask at entry + // cr7 - beq if relatively aligned + // + +LongReverse: + add rd,rd,rc // update source/dest ptrs to be 1 byte past end + add rs,rs,rc + andi. r7,rd,0xF // r7 <- #bytes needed to move to align destination + sub rc,rc,r7 // adjust length while we wait + sub rs,rs,r7 // adjust ptrs by #bytes to xfer, also while we wait + sub rd,rd,r7 + beq- LDest16AlignedReverse + + // Align destination on a quadword. Note that we do NOT align on a cache + // block boundary for store gathering etc// since all these operands overlap + // many dest cache blocks will already be in the L1, so its not clear that + // this would be a win. + + mtxer r7 // load byte count + lswx r8,0,rs + stswx r8,0,rd + + // Prepare for inner loop and start "dstst" stream. Frankly, its not + // clear whether "dst" or "dstst" would be better// somebody should + // measure. We use "dstst" because, being overlapped, at least some + // source cache blocks will also be stored into. + +LDest16AlignedReverse: + srwi. r12,rc,6 // r12 <- count of 64-byte chunks to move + rlwinm r0,rc,11,9,15 // position quadword count for dst + rlwinm r11,r12,0,26,31 // mask chunk count down to 0-63 + li r9,-17 // r9 <- constant used to access 2nd quadword + oris r0,r0,0x0100 // set dst block size to 1 qw + li r10,-33 // r10<- constant used to access 3rd quadword + ori r0,r0,0xFFE0 // set dst stride to -16 bytes + li r8,-1 // r8<- constant used to access 1st quadword + dstst rs,r0,3 // start stream 0 + subi r0,r11,1 // r11==0 ? + lvx va,r8,rs // prefetch 1st source quadword + rlwinm r7,rc,28,30,31 // r7 <- count of 16-byte chunks to move + lvsl vp,0,rs // get permute vector to right shift + cmpwi cr1,r7,0 // remember if any 16-byte chunks + beq- LNoChunksReverse // r12==0, so skip inner loop + rlwimi r11,r0,0,25,25 // if r11==0, then r11 <- 64 + sub. r12,r12,r11 // adjust chunk count, set cr0 + mtctr r11 // set up loop count + li r11,-49 // r11<- constant used to access 4th quadword + li r0,-64 // r0 <- used for several purposes + beq- cr7,LAlignedLoopBy64Reverse + + // Inner loop for unaligned sources. We copy 64 bytes per iteration. + +LoopBy64Reverse: + lvx vb,r9,rs //(1) 2nd source quadword (1st already in va) + lvx vc,r10,rs //(2) 3rd quadword + lvx vd,r11,rs //(3) 4th + vperm vx,vb,va,vp //(3) vx <- 1st destination quadword + lvx va,rs,r0 //(4) get 1st qw of next 64-byte chunk (note r0 must be RB) + vperm vy,vc,vb,vp //(4) vy <- 2nd dest qw + stvx vx,r8,rd //(5) + vperm vz,vd,vc,vp //(5) vz <- 3rd destination quadword + stvx vy,r9,rd //(6) + vperm vx,va,vd,vp //(6) vx <- 4th qw + stvx vz,r10,rd //(7) + subi rs,rs,64 //(7) + stvx vx,r11,rd //(8) + subi rd,rd,64 //(8) + bdnz LoopBy64Reverse //(8) + + // End of inner loop. Should we reprime dst stream and restart loop? + // This block is only executed when we're moving more than 4KB. + // It is usually folded out because cr0 is set in the loop prologue. + + beq+ LNoChunksReverse // r12==0, ie no more chunks to move + lis r8,0x0440 // dst control: 64 4-qw blocks + add. r12,r12,r0 // set cr0 if more than 4KB remain to xfer + ori r8,r8,0xFFC0 // stride is -64 bytes + dstst rs,r8,3 // restart the prefetch stream + li r8,64 // inner loop count + mtctr r8 // initialize loop count to 64 + li r8,-1 // restore qw1 offset for inner loop + b LoopBy64Reverse // restart inner loop, xfer another 4KB + + // Fewer than 64 bytes remain to be moved. 
+ +LNoChunksReverse: // r7 and cr1 are set with the number of QWs + andi. rc,rc,0xF // rc <- leftover bytes + beq- cr1,LCleanupReverse // r7==0, ie fewer than 16 bytes left + mtctr r7 + beq- cr7,LAlignedLoopBy16Reverse + +LoopBy16Reverse: + lvx vb,r9,rs // vb <- 2nd source quadword + subi rs,rs,16 + vperm vx,vb,va,vp // vx <- next destination quadword + vor va,vb,vb // va <- vb + stvx vx,r8,rd + subi rd,rd,16 + bdnz LoopBy16Reverse + + // Fewer that 16 bytes remain to be moved. + +LCleanupReverse: // rc and cr0 set with remaining byte count + lwz r6,-4(r1) // load VRSave from CR save area + mtspr VRSave,r6 // restore caller's live-register bitmask + beqlr // rc==0, ie no leftovers so done + neg r7,rc // get -(#bytes) + mtxer rc // byte count + lswx r8,r7,rs + stswx r8,r7,rd + blr + + +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// <><> A L I G N E D L O N G R E V E R S E M O V E S <><> +// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> + + // Inner loop. We copy 64 bytes per iteration. + +LAlignedLoopBy64Reverse: + lvx va,r8,rs //(1) + lvx vb,r9,rs //(2) + lvx vc,r10,rs //(3) + lvx vd,r11,rs //(4) + subi rs,rs,64 //(4) + stvx va,r8,rd //(5) + stvx vb,r9,rd //(6) + stvx vc,r10,rd //(7) + stvx vd,r11,rd //(8) + subi rd,rd,64 //(8) + bdnz LAlignedLoopBy64Reverse //(8) + + // End of inner loop. Loop for next 4KB iff any. + + beq+ LNoChunksReverse // r12==0, ie no more chunks to move + lis r8,0x0440 // dst control: 64 4-qw blocks + add. r12,r12,r0 // r12 <- r12 - 64, set cr0 + ori r8,r8,0xFFC0 // stride is -64 bytes + dstst rs,r8,3 // restart the prefetch stream + li r8,64 // inner loop count + mtctr r8 // initialize loop count to 64 + li r8,-1 // restore qw1 offset for inner loop + b LAlignedLoopBy64Reverse + + // Loop to copy leftover quadwords (1-3). + +LAlignedLoopBy16Reverse: + lvx va,r8,rs // get next qw + subi rs,rs,16 + stvx va,r8,rd + subi rd,rd,16 + bdnz LAlignedLoopBy16Reverse + + b LCleanupReverse // handle up to 15 bytes in last qw diff --git a/gen.subproj/ppc.subproj/memcpy.s b/gen.subproj/ppc.subproj/memcpy.s deleted file mode 100644 index 0c371f6..0000000 --- a/gen.subproj/ppc.subproj/memcpy.s +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -#define MEMCPY -#include "bcopy.s" diff --git a/gen.subproj/ppc.subproj/memmove.s b/gen.subproj/ppc.subproj/memmove.s deleted file mode 100644 index d517786..0000000 --- a/gen.subproj/ppc.subproj/memmove.s +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -#define MEMMOVE -#include "bcopy.s" diff --git a/gen.subproj/scalable_malloc.c b/gen.subproj/scalable_malloc.c index cbf6ab8..a19c7a4 100644 --- a/gen.subproj/scalable_malloc.c +++ b/gen.subproj/scalable_malloc.c @@ -76,7 +76,8 @@ If 0 then the block is either free (in which case the size is directly at the bl #define PROTECT_SMALL 0 // Should be 0: 1 is too slow for normal use -#define LARGE_CACHE_SIZE 4 // define hysterisis of large chunks +#define LARGE_CACHE_SIZE 1 // define hysterisis of large chunks +#define MAX_LARGE_SIZE_TO_CACHE (128*1024) /* blocks larger than this are not cached */ #define MAX_RECORDER_BUFFER 256 @@ -149,6 +150,7 @@ static size_t szone_good_size(szone_t *szone, size_t size); static boolean_t szone_check_all(szone_t *szone, const char *function); static void szone_print(szone_t *szone, boolean_t verbose); static INLINE region_t *region_for_ptr_no_lock(szone_t *szone, const void *ptr); +static vm_range_t large_free_no_lock(szone_t *szone, large_entry_t *entry); #define LOG(szone,ptr) (szone->log_address && (szone->num_small_objects > 8) && (((unsigned)szone->log_address == -1) || (szone->log_address == (void *)(ptr)))) @@ -931,11 +933,9 @@ static void large_entries_grow_no_lock(szone_t *szone) { } static vm_range_t large_free_no_lock(szone_t *szone, large_entry_t *entry) { - // enters the specified large entry into the cache of freed entries - // returns a range to truly deallocate - vm_range_t vm_range_to_deallocate; + // frees the specific entry in the size table + // returns a range to truly deallocate, taking into account vm_range_t range; - vm_range_t *range_to_use; range.address = LARGE_ENTRY_ADDRESS(*entry); range.size = LARGE_ENTRY_SIZE(*entry); szone->num_large_objects_in_use --; @@ -956,6 +956,18 @@ static vm_range_t large_free_no_lock(szone_t *szone, large_entry_t *entry) { sleep(3600); } #endif + return range; +} + +static vm_range_t large_find_better_range_to_deallocate(szone_t *szone, vm_range_t range) { + // enters the specified large entry into the cache of freed entries + // returns a range to truly deallocate + vm_range_t *range_to_use; + vm_range_t vm_range_to_deallocate; + + // if the specified range in larger than MAX_LARGE_SIZE_TO_CACHE the range is not cached + if (range.size > MAX_LARGE_SIZE_TO_CACHE) return range; + range = coalesce_range(szone->large_to_deallocate, LARGE_CACHE_SIZE, range); range_to_use = first_zero_range(szone->large_to_deallocate, LARGE_CACHE_SIZE); if (range_to_use) { @@ -1185,6 +1197,7 @@ static void szone_free(szone_t *szone, void *ptr) { vm_msync(mach_task_self(), LARGE_ENTRY_ADDRESS(*entry), LARGE_ENTRY_SIZE(*entry), VM_SYNC_KILLPAGES); } 
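Aside on the free path above: large_free_no_lock now only removes the entry from the large-entry table and returns the freed range, and the new large_find_better_range_to_deallocate decides whether to park that range in the small cache of recently freed large blocks (LARGE_CACHE_SIZE entries, after coalescing adjacent ranges) or hand it back for immediate deallocation; anything bigger than MAX_LARGE_SIZE_TO_CACHE (128KB) is never cached. A minimal sketch of that decision, leaving out coalescing and locking and using hypothetical names in place of the szone helpers:

#include <stddef.h>

typedef struct { void *address; size_t size; } range_t;

#define CACHE_SLOTS        1            /* mirrors LARGE_CACHE_SIZE        */
#define MAX_SIZE_TO_CACHE  (128*1024)   /* mirrors MAX_LARGE_SIZE_TO_CACHE */

/*
 * Given a just-freed large range, either park it in the cache of recently
 * freed ranges (so a following large allocation can reuse it) or return it
 * unchanged so the caller truly deallocates it.
 */
static range_t large_cache_or_return(range_t cache[], range_t freed)
{
    unsigned i;
    range_t evicted;

    /* Huge blocks are never cached: give them straight back to the VM. */
    if (freed.size > MAX_SIZE_TO_CACHE)
        return freed;

    for (i = 0; i < CACHE_SLOTS; i++) {
        if (cache[i].size == 0) {          /* empty slot: keep the range  */
            cache[i] = freed;
            freed.address = NULL;
            freed.size = 0;                /* nothing left to deallocate  */
            return freed;
        }
    }

    /* Cache full: evict one entry and deallocate that one instead. */
    evicted = cache[0];
    cache[0] = freed;
    return evicted;
}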
vm_range_to_deallocate = large_free_no_lock(szone, entry); + vm_range_to_deallocate = large_find_better_range_to_deallocate(szone, vm_range_to_deallocate); #if DEBUG_MALLOC if (large_entry_for_pointer_no_lock(szone, ptr)) { malloc_printf("*** malloc[%d]: Just after freeing 0x%x still in use num_large_entries=%d\n", getpid(), ptr, szone->num_large_entries); @@ -1386,12 +1399,27 @@ static void *szone_realloc(szone_t *szone, void *ptr, size_t new_size) { if (szone_try_realloc_in_place(szone, ptr, old_size, new_size)) return ptr; } newPtr = szone_malloc(szone, new_size); - if (old_size > VM_COPY_THRESHOLD) { + if ((old_size > VM_COPY_THRESHOLD) && (old_size < (1 << (vm_page_shift + vm_page_shift)))) { + // we know it's a large block, and not a huge block kern_return_t err = 0; err = vm_copy(mach_task_self(), (vm_address_t)ptr, old_size, (vm_address_t)newPtr); if (err) { szone_error(szone, "Can't vm_copy region", ptr); - } + } else { + large_entry_t *entry; + vm_range_t range; + SZONE_LOCK(szone); + entry = large_entry_for_pointer_no_lock(szone, ptr); + if (!entry) { + szone_error(szone, "Can't find entry for large copied block", ptr); + } + range = large_free_no_lock(szone, entry); + SZONE_UNLOCK(szone); // we release the lock asap + // we truly deallocate_pages, including guard pages + deallocate_pages(szone, range.address, range.size, 0); + if (LOG(szone, ptr)) malloc_printf("szone_realloc returned %p for %d\n", newPtr, (unsigned)new_size); + return newPtr; + } } else { memcpy(newPtr, ptr, old_size); } diff --git a/locale.subproj/rune.c b/locale.subproj/rune.c index 2325a7e..631c815 100644 --- a/locale.subproj/rune.c +++ b/locale.subproj/rune.c @@ -92,7 +92,7 @@ setrunelocale(encoding) return(0); } - if (!PathLocale && !(PathLocale = getenv("PATH_LOCALE"))) + if (!PathLocale) PathLocale = _PATH_LOCALE; sprintf(name, "%s/%s/LC_CTYPE", PathLocale, encoding); diff --git a/locale.subproj/setlocale.c b/locale.subproj/setlocale.c index 8011e68..7dc8b93 100644 --- a/locale.subproj/setlocale.c +++ b/locale.subproj/setlocale.c @@ -105,7 +105,7 @@ setlocale(category, locale) int found, i, len; char *env, *r; - if (!PathLocale && !(PathLocale = getenv("PATH_LOCALE"))) + if (!PathLocale) PathLocale = _PATH_LOCALE; if (category < 0 || category >= _LC_LAST) diff --git a/mach.subproj/mach_init.c b/mach.subproj/mach_init.c index 6fac3a0..2d3bd9c 100644 --- a/mach.subproj/mach_init.c +++ b/mach.subproj/mach_init.c @@ -123,7 +123,7 @@ int mach_init_doit(int forkchild) _atfork_child_routine = mach_atfork_child_routine; _pthread_set_self(0); cthread_set_self(0); - } + } /* * Initialize the single mig reply port @@ -209,11 +209,11 @@ int fork_mach_init() mach_port_t mach_task_self() { - return(mach_task_self_); + return(task_self_trap()); } mach_port_t mach_thread_self() { return(thread_self_trap()); -} \ No newline at end of file +} diff --git a/pthreads.subproj/pthread.c b/pthreads.subproj/pthread.c index ddf28a8..3a927bd 100644 --- a/pthreads.subproj/pthread.c +++ b/pthreads.subproj/pthread.c @@ -55,8 +55,10 @@ extern pthread_lock_t reply_port_lock; */ size_t _pthread_stack_size = 0; -int _spin_tries = 1; +int _spin_tries = 0; +#if !defined(__ppc__) int _cpu_has_altivec = 0; +#endif /* This global should be used (carefully) by anyone needing to know if a pthread has been ** created. 
@@ -105,14 +107,6 @@ extern mach_port_t thread_recycle_port; #endif -/* This is the struct used to recycle (or terminate) a thread */ -/* We stash the thread port into the reply port of the message */ - -typedef struct { - mach_msg_header_t header; - mach_msg_trailer_t trailer; -} recycle_msg_t; - /* Set the base address to use as the stack pointer, before adjusting due to the ABI */ static int @@ -514,12 +508,6 @@ pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize) } } -pthread_t _cachedThread = (pthread_t)0; - -void _clear_thread_cache(void) { - _cachedThread = (pthread_t)0; -} - /* * Create and start execution of a new thread. */ @@ -527,7 +515,6 @@ void _clear_thread_cache(void) { static void _pthread_body(pthread_t self) { - _clear_thread_cache(); _pthread_set_self(self); pthread_exit((self->fun)(self->arg)); } @@ -721,9 +708,9 @@ pthread_detach(pthread_t thread) thread->death = MACH_PORT_NULL; UNLOCK(thread->lock); if (num_joiners > 0) - { /* Have to tell these guys this thread can't be joined with */ - swtch_pri(0); - PTHREAD_MACH_CALL(semaphore_signal_all(thread->joiners), kern_res); + { + /* Wake up a joiner */ + PTHREAD_MACH_CALL(semaphore_signal(thread->joiners), kern_res); } /* Destroy 'control' semaphores */ PTHREAD_MACH_CALL(semaphore_destroy(mach_task_self(), @@ -731,6 +718,10 @@ pthread_detach(pthread_t thread) PTHREAD_MACH_CALL(semaphore_destroy(mach_task_self(), death), kern_res); return (ESUCCESS); + } else if (thread->detached == _PTHREAD_EXITED) { + UNLOCK(thread->lock); + pthread_join(thread, NULL); + return ESUCCESS; } else { UNLOCK(thread->lock); @@ -748,16 +739,20 @@ pthread_detach(pthread_t thread) /* terminated, it will be yanked out from under the mach_msg() call. */ static void _pthread_become_available(pthread_t thread) { - recycle_msg_t msg = { { 0 } }; + mach_msg_empty_rcv_t msg = { { 0 } }; kern_return_t ret; + if (thread->reply_port == MACH_PORT_NULL) { + thread->reply_port = mach_reply_port(); + } msg.header.msgh_size = sizeof msg - sizeof msg.trailer; msg.header.msgh_remote_port = thread_recycle_port; msg.header.msgh_local_port = MACH_PORT_NULL; msg.header.msgh_id = (int)thread; msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0); - ret = mach_msg(&msg.header, MACH_SEND_MSG, msg.header.msgh_size, 0, - MACH_PORT_NULL, MACH_MSG_TIMEOUT_NONE, + ret = mach_msg(&msg.header, MACH_SEND_MSG | MACH_RCV_MSG, + msg.header.msgh_size, sizeof msg, + thread->reply_port, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL); while (1) { ret = thread_suspend(thread->kernel_thread); @@ -767,17 +762,17 @@ static void _pthread_become_available(pthread_t thread) { /* Check to see if any threads are available. 
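_pthread_become_available() above now lazily creates a reply port and combines MACH_SEND_MSG with MACH_RCV_MSG, so announcing the thread to the recycle port and parking it until the reaper answers happen in a single trap. A standalone restatement of that call sequence as a sketch, not the libc function itself; the ports and the id encoding are whatever the caller chooses, and no error handling is shown:

#include <mach/mach.h>

/* Send a one-word identification message to the recycle port and, in the
 * same mach_msg() trap, block waiting for the reaper's reply on our own
 * reply port. */
static kern_return_t park_on_recycle_port(mach_port_t recycle_port,
                                          mach_port_t reply_port,
                                          int id)
{
    mach_msg_empty_rcv_t msg = { { 0 } };

    msg.header.msgh_size        = sizeof msg - sizeof msg.trailer;
    msg.header.msgh_remote_port = recycle_port;
    msg.header.msgh_local_port  = MACH_PORT_NULL;
    msg.header.msgh_id          = id;
    msg.header.msgh_bits        = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0);

    /* MACH_SEND_MSG|MACH_RCV_MSG: the caller is descheduled as soon as the
     * send completes, until something replies to reply_port. */
    return mach_msg(&msg.header, MACH_SEND_MSG | MACH_RCV_MSG,
                    msg.header.msgh_size, sizeof msg,
                    reply_port, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
}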
Return immediately */ -static kern_return_t _pthread_check_for_available_threads(recycle_msg_t *msg) { +static kern_return_t _pthread_check_for_available_threads(mach_msg_empty_rcv_t *msg) { return mach_msg(&msg->header, MACH_RCV_MSG|MACH_RCV_TIMEOUT, 0, - sizeof(recycle_msg_t), thread_recycle_port, 0, + sizeof(mach_msg_empty_rcv_t), thread_recycle_port, 0, MACH_PORT_NULL); } /* Terminate all available threads and deallocate their stacks */ static void _pthread_reap_threads(void) { kern_return_t ret; - recycle_msg_t msg = { { 0 } }; - while(_pthread_check_for_available_threads(&msg) == KERN_SUCCESS) { + mach_msg_empty_rcv_t msg = { { 0 } }; + while((ret = _pthread_check_for_available_threads(&msg)) == KERN_SUCCESS) { pthread_t th = (pthread_t)msg.header.msgh_id; mach_port_t kernel_thread = th->kernel_thread; mach_port_t reply_port = th->reply_port; @@ -807,31 +802,14 @@ static void _pthread_reap_threads(void) { } free(th); } + assert(ret == MACH_RCV_TIMED_OUT); } - -static void * -stackAddress(void) -{ - unsigned dummy; - return (void *)((unsigned)&dummy & ~ (PTHREAD_STACK_MIN - 1)); -} - -extern pthread_t _pthread_self(void); +/* For compatibility... */ pthread_t -pthread_self(void) -{ - void * myStack = (void *)0; - pthread_t cachedThread = _cachedThread; - if (cachedThread) { - myStack = stackAddress(); - if ((void *)((unsigned)(cachedThread->stackaddr - 1) & ~ (PTHREAD_STACK_MIN - 1)) == myStack) { - return cachedThread; - } - } - _cachedThread = _pthread_self(); - return _cachedThread; +_pthread_self() { + return pthread_self(); } /* @@ -844,7 +822,6 @@ pthread_exit(void *value_ptr) struct _pthread_handler_rec *handler; kern_return_t kern_res; int num_joiners; - _clear_thread_cache(); while ((handler = self->cleanup_stack) != 0) { (handler->routine)(handler->arg); @@ -860,10 +837,14 @@ pthread_exit(void *value_ptr) UNLOCK(self->lock); if (num_joiners > 0) { - swtch_pri(0); - PTHREAD_MACH_CALL(semaphore_signal_all(self->joiners), kern_res); + /* POSIX says that multiple pthread_join() calls on */ + /* the same thread are undefined so we just wake up */ + /* the first one to join */ + PTHREAD_MACH_CALL(semaphore_signal(self->joiners), kern_res); } - PTHREAD_MACH_CALL(semaphore_wait(self->death), kern_res); + do { + PTHREAD_MACH_CALL(semaphore_wait(self->death), kern_res); + } while (kern_res == KERN_ABORTED); } else UNLOCK(self->lock); /* Destroy thread & reclaim resources */ @@ -896,7 +877,9 @@ pthread_join(pthread_t thread, { thread->num_joiners++; UNLOCK(thread->lock); - PTHREAD_MACH_CALL(semaphore_wait(thread->joiners), kern_res); + do { + PTHREAD_MACH_CALL(semaphore_wait(thread->joiners), kern_res); + } while (kern_res == KERN_ABORTED); LOCK(thread->lock); thread->num_joiners--; } @@ -909,7 +892,6 @@ pthread_join(pthread_t thread, *value_ptr = thread->exit_value; } UNLOCK(thread->lock); - swtch_pri(0); PTHREAD_MACH_CALL(semaphore_signal(thread->death), kern_res); return (ESUCCESS); } else @@ -1183,14 +1165,10 @@ pthread_init(void) } attrs = &_attr; pthread_attr_init(attrs); - _clear_thread_cache(); - _pthread_set_self(&_thread); + _pthread_set_self(&_thread); _pthread_create(&_thread, attrs, USRSTACK, mach_thread_self()); - thread = (pthread_t)malloc(sizeof(struct _pthread)); - memcpy(thread, &_thread, sizeof(struct _pthread)); - _clear_thread_cache(); - _pthread_set_self(thread); + thread = &_thread; thread->detached = _PTHREAD_CREATE_PARENT; /* See if we're on a multiprocessor and set _spin_tries if so. 
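pthread_exit() and pthread_join() above now loop while semaphore_wait() returns KERN_ABORTED, since an aborted wait means the call was interrupted rather than signalled. The same idiom as a small wrapper one might use; a sketch, not libc's actual helper:

#include <mach/mach.h>
#include <mach/semaphore.h>

/* Wait on a Mach semaphore, retrying when the wait is aborted (for example
 * across a fork or a signal) without the semaphore ever being signalled. */
static kern_return_t semaphore_wait_no_abort(semaphore_t sem)
{
    kern_return_t kr;

    do {
        kr = semaphore_wait(sem);
    } while (kr == KERN_ABORTED);

    return kr;
}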
*/ @@ -1199,7 +1177,7 @@ pthread_init(void) len = sizeof(numcpus); if (sysctl(mib, 2, &numcpus, &len, NULL, 0) == 0) { if (numcpus > 1) { - _spin_tries = SPIN_TRIES; + _spin_tries = MP_SPIN_TRIES; } } else { count = HOST_BASIC_INFO_COUNT; @@ -1210,7 +1188,7 @@ pthread_init(void) printf("host_info failed (%d)\n", kr); else { if (basic_info.avail_cpus > 1) - _spin_tries = SPIN_TRIES; + _spin_tries = MP_SPIN_TRIES; /* This is a crude test */ if (basic_info.cpu_subtype >= CPU_SUBTYPE_POWERPC_7400) _cpu_has_altivec = 1; diff --git a/pthreads.subproj/pthread_cond.c b/pthreads.subproj/pthread_cond.c index ae6ef05..4eee4a5 100644 --- a/pthreads.subproj/pthread_cond.c +++ b/pthreads.subproj/pthread_cond.c @@ -295,7 +295,8 @@ _pthread_cond_wait(pthread_cond_t *cond, if ((res = pthread_mutex_lock(mutex)) != ESUCCESS) { return (res); } - if (kern_res == KERN_SUCCESS) { + /* KERN_ABORTED can be treated as a spurious wakeup */ + if ((kern_res == KERN_SUCCESS) || (kern_res == KERN_ABORTED)) { return (ESUCCESS); } else if (kern_res == KERN_OPERATION_TIMED_OUT) { return (ETIMEDOUT); diff --git a/pthreads.subproj/pthread_internals.h b/pthreads.subproj/pthread_internals.h index 1e96b3c..2cfde61 100644 --- a/pthreads.subproj/pthread_internals.h +++ b/pthreads.subproj/pthread_internals.h @@ -198,24 +198,33 @@ extern boolean_t swtch_pri(int); /* Number of times to spin when the lock is unavailable and we are on a multiprocessor. On a uniprocessor we yield the processor immediately. */ -#define SPIN_TRIES 10 +#define MP_SPIN_TRIES 1000 extern int _spin_tries; extern int __is_threaded; extern int _cpu_has_altivec; /* Internal mutex locks for data structures */ -#define TRY_LOCK(v) (!__is_threaded || _spin_lock_try((pthread_lock_t *)&v)) -#if 0 -#define LOCK(v) if (__is_threaded) _spin_lock((pthread_lock_t)&v) -#else -#define LOCK(v) \ - if (__is_threaded) { \ - while (!_spin_lock_try((pthread_lock_t *)&v)) { \ - syscall_thread_switch(THREAD_NULL, SWITCH_OPTION_WAIT, 1); \ - } \ - } -#endif -#define UNLOCK(v) if (__is_threaded) _spin_unlock((pthread_lock_t *)&v) +#define TRY_LOCK(v) (!__is_threaded || _spin_lock_try((pthread_lock_t *)&(v))) +#define LOCK(v) \ +do { \ + if (__is_threaded) { \ + int tries = _spin_tries; \ + \ + while (!_spin_lock_try((pthread_lock_t *)&(v))) { \ + if (tries-- > 0) \ + continue; \ + \ + syscall_thread_switch(THREAD_NULL, SWITCH_OPTION_DEPRESS, 1); \ + tries = _spin_tries; \ + } \ + } \ +} while (0) +#define UNLOCK(v) \ +do { \ + if (__is_threaded) \ + _spin_unlock((pthread_lock_t *)&(v)); \ +} while (0) + #ifndef ESUCCESS #define ESUCCESS 0 #endif diff --git a/pthreads.subproj/pthread_mutex.c b/pthreads.subproj/pthread_mutex.c index 1276e60..427026a 100644 --- a/pthreads.subproj/pthread_mutex.c +++ b/pthreads.subproj/pthread_mutex.c @@ -141,7 +141,9 @@ pthread_mutex_lock(pthread_mutex_t *mutex) mutex->sem = new_sem_from_pool(); } UNLOCK(mutex->lock); - PTHREAD_MACH_CALL(semaphore_wait(mutex->sem), kern_res); + do { + PTHREAD_MACH_CALL(semaphore_wait(mutex->sem), kern_res); + } while (kern_res == KERN_ABORTED); LOCK(mutex->lock); mutex->waiters--; if (mutex->waiters == 0) { diff --git a/stdio.subproj/vfprintf.c b/stdio.subproj/vfprintf.c index f819e25..2e49b6b 100644 --- a/stdio.subproj/vfprintf.c +++ b/stdio.subproj/vfprintf.c @@ -276,7 +276,7 @@ __uqtoa(val, endp, base, octzero, xdigs) #define BUF (MAXEXP+MAXFRACT+1) /* + decimal point */ #define DEFPREC 6 -static char *cvt __P((double, int, int, char *, int *, int, int *)); +static char *cvt __P((double, int, int, char *, int *, 
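The new LOCK() macro spins up to _spin_tries times on the try-lock, then yields and restarts the count; _spin_tries is 0 on a uniprocessor and MP_SPIN_TRIES (1000) on a multiprocessor. A portable C11 sketch of that shape, using atomic_flag for the spin lock and sched_yield() as a stand-in for syscall_thread_switch(THREAD_NULL, SWITCH_OPTION_DEPRESS, 1):

#include <stdatomic.h>
#include <sched.h>

static atomic_flag the_lock = ATOMIC_FLAG_INIT;
static int spin_tries = 1000;           /* 0 on a uniprocessor */

static void spin_then_yield_lock(void)
{
    int tries = spin_tries;

    while (atomic_flag_test_and_set_explicit(&the_lock, memory_order_acquire)) {
        if (tries-- > 0)
            continue;                   /* still within the spin budget */
        sched_yield();                  /* give up the CPU, then spin again */
        tries = spin_tries;
    }
}

static void spin_unlock(void)
{
    atomic_flag_clear_explicit(&the_lock, memory_order_release);
}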
int, int *, char **)); static int exponent __P((char *, int, int)); #else /* no FLOATING_POINT */ @@ -322,6 +322,7 @@ vfprintf(fp, fmt0, ap) int expsize = 0; /* character count for expstr */ int ndig; /* actual number of digits returned by cvt */ char expstr[7]; /* buffer for exponent string */ + char *dtoaresult; /* buffer allocated by dtoa */ #endif u_long ulval = 0; /* integer arguments %[diouxX] */ u_quad_t uqval = 0; /* %q integers */ @@ -428,8 +429,9 @@ vfprintf(fp, fmt0, ap) } else { \ val = GETARG (int); \ } - - +#ifdef FLOATING_POINT + dtoaresult = NULL; +#endif /* FLOCKFILE(fp); */ /* sorry, fprintf(read_only_file, "") returns EOF, not 0 */ if (cantwrite(fp)) { @@ -621,7 +623,7 @@ fp_begin: if (prec == -1) } flags |= FPT; cp = cvt(_double, prec, flags, &softsign, - &expt, ch, &ndig); + &expt, ch, &ndig, &dtoaresult); if (ch == 'g' || ch == 'G') { if (expt <= -4 || expt > prec) ch = (ch == 'g') ? 'e' : 'E'; @@ -877,6 +879,10 @@ number: if ((dprec = prec) >= 0) done: FLUSH(); error: +#ifdef FLOATING_POINT + if (dtoaresult != NULL) + free(dtoaresult); +#endif if (__sferror(fp)) ret = EOF; /* FUNLOCKFILE(fp); */ @@ -911,7 +917,7 @@ error: * Find all arguments when a positional parameter is encountered. Returns a * table, indexed by argument number, of pointers to each arguments. The * initial argument table should be an array of STATIC_ARG_TBL_SIZE entries. - * It will be replaces with a malloc-ed on if it overflows. + * It will be replaces with a malloc-ed one if it overflows. */ static void __find_arguments (fmt0, ap, argtable) @@ -937,8 +943,8 @@ __find_arguments (fmt0, ap, argtable) #define ADDTYPE(type) \ ((nextarg >= tablesize) ? \ __grow_type_table(nextarg, &typetable, &tablesize) : 0, \ - typetable[nextarg++] = type, \ - (nextarg > tablemax) ? tablemax = nextarg : 0) + (nextarg > tablemax) ? tablemax = nextarg : 0, \ + typetable[nextarg++] = type) #define ADDSARG() \ ((flags&LONGINT) ? 
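The vfprintf changes above drop the static result buffer inside __dtoa: the digits now come back in a malloc'd buffer handed out through the new dtoaresult/resultp parameter, and vfprintf frees it once on the way out, which makes the path reentrant. A small sketch of that ownership convention; the function name and the digit formatting are illustrative, not the real __dtoa:

#include <stdio.h>
#include <stdlib.h>

/* The conversion routine allocates its digit buffer with malloc() and
 * returns it through an out-parameter; the single caller frees it once
 * formatting is done, so no static state survives between calls. */
static char *format_digits(double value, char **resultp)
{
    char *buf = malloc(32);

    if (buf == NULL)
        return NULL;
    snprintf(buf, 32, "%.17g", value);
    *resultp = buf;                 /* caller owns and eventually frees this */
    return buf;
}

int main(void)
{
    char *dtoaresult = NULL;

    if (format_digits(3.14159, &dtoaresult) != NULL)
        printf("digits: %s\n", dtoaresult);
    free(dtoaresult);               /* freed exactly once, on the way out */
    return 0;
}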
ADDTYPE(T_LONG) : \ @@ -1191,33 +1197,38 @@ __grow_type_table (nextarg, typetable, tablesize) unsigned char **typetable; int *tablesize; { - unsigned char *oldtable = *typetable; - int newsize = *tablesize * 2; - - if (*tablesize == STATIC_ARG_TBL_SIZE) { - *typetable = (unsigned char *) - malloc (sizeof (unsigned char) * newsize); - bcopy (oldtable, *typetable, *tablesize); + unsigned char *const oldtable = *typetable; + const int oldsize = *tablesize; + unsigned char *newtable; + int newsize = oldsize * 2; + + if (newsize < nextarg + 1) + newsize = nextarg + 1; + if (oldsize == STATIC_ARG_TBL_SIZE) { + if ((newtable = malloc (newsize)) == NULL) + abort(); /* XXX handle better */ + bcopy (oldtable, newtable, oldsize); } else { - *typetable = (unsigned char *) - realloc (typetable, sizeof (unsigned char) * newsize); - + if ((newtable = realloc (oldtable, newsize)) == NULL) + abort(); /* XXX handle better */ } - memset (&typetable [*tablesize], T_UNUSED, (newsize - *tablesize)); + memset (&newtable [oldsize], T_UNUSED, (newsize - oldsize)); + *typetable = newtable; *tablesize = newsize; } #ifdef FLOATING_POINT -extern char *__dtoa __P((double, int, int, int *, int *, char **)); +extern char *__dtoa __P((double, int, int, int *, int *, char **, char **)); static char * -cvt(value, ndigits, flags, sign, decpt, ch, length) +cvt(value, ndigits, flags, sign, decpt, ch, length, dtoaresultp) double value; int ndigits, flags, *decpt, ch, *length; char *sign; + char **dtoaresultp; { int mode, dsgn; char *digits, *bp, *rve; @@ -1239,7 +1250,7 @@ cvt(value, ndigits, flags, sign, decpt, ch, length) *sign = '-'; } else *sign = '\000'; - digits = __dtoa(value, mode, ndigits, decpt, &dsgn, &rve); + digits = __dtoa(value, mode, ndigits, decpt, &dsgn, &rve, dtoaresultp); if ((ch != 'g' && ch != 'G') || flags & ALT) { /* print trailing zeros */ bp = digits + ndigits; diff --git a/stdio.subproj/vfscanf.c b/stdio.subproj/vfscanf.c index c245bfc..a12e167 100644 --- a/stdio.subproj/vfscanf.c +++ b/stdio.subproj/vfscanf.c @@ -80,6 +80,7 @@ #define SUPPRESS 0x08 /* suppress assignment */ #define POINTER 0x10 /* weird %p pointer (`fake hex') */ #define NOSKIP 0x20 /* do not skip blanks */ +#define QUAD 0x400 /* * The following are used in numeric conversions only: @@ -101,13 +102,13 @@ #define CT_CHAR 0 /* %c conversion */ #define CT_CCL 1 /* %[...] conversion */ #define CT_STRING 2 /* %s conversion */ -#define CT_INT 3 /* integer, i.e., strtol or strtoul */ +#define CT_INT 3 /* integer, i.e., strtoq or strtouq */ #define CT_FLOAT 4 /* floating, i.e., strtod */ #define u_char unsigned char #define u_long unsigned long -static u_char *__sccl(); +static u_char *__sccl(char *, u_char *); /* * vfscanf @@ -127,8 +128,8 @@ __svfscanf(fp, fmt0, ap) register char *p0; /* saves original value of p when necessary */ int nassigned; /* number of fields assigned */ int nread; /* number of characters consumed from fp */ - int base; /* base argument to strtol/strtoul */ - u_long (*ccfn)(); /* conversion function (strtol/strtoul) */ + int base; /* base argument to strtoq/strtouq */ + u_quad_t (*ccfn)(); /* conversion function (strtoq/strtouq) */ char ccltab[256]; /* character class table for %[...] 
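The __grow_type_table() rewrite above malloc's and copies on the first growth, because the initial table lives in static storage and cannot be handed to realloc(), and realloc's thereafter; it also clamps the new size so the requested argument index always fits. A self-contained sketch of that logic with illustrative constants:

#include <stdlib.h>
#include <string.h>

#define STATIC_TBL_SIZE 8
#define T_UNUSED        0

static void grow_type_table(int nextarg, unsigned char **typetable, int *tablesize)
{
    unsigned char *oldtable = *typetable;
    int oldsize = *tablesize;
    int newsize = oldsize * 2;
    unsigned char *newtable;

    if (newsize < nextarg + 1)
        newsize = nextarg + 1;          /* make sure the requested slot fits */
    if (oldsize == STATIC_TBL_SIZE) {
        /* first growth: the old table is static, so copy rather than realloc */
        if ((newtable = malloc(newsize)) == NULL)
            abort();                    /* the library punts here too */
        memcpy(newtable, oldtable, oldsize);
    } else {
        if ((newtable = realloc(oldtable, newsize)) == NULL)
            abort();
    }
    memset(&newtable[oldsize], T_UNUSED, newsize - oldsize);
    *typetable = newtable;
    *tablesize = newsize;
}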
*/ char buf[BUF]; /* buffer for numeric conversions */ @@ -180,6 +181,9 @@ literal: case 'l': flags |= LONG; goto again; + case 'q': + flags |= QUAD; + goto again; case 'L': flags |= LONGDBL; goto again; @@ -204,13 +208,13 @@ literal: /* FALLTHROUGH */ case 'd': c = CT_INT; - ccfn = (u_long (*)())strtol; + ccfn = (u_quad_t (*)())strtoq; base = 10; break; case 'i': c = CT_INT; - ccfn = (u_long (*)())strtol; + ccfn = (u_quad_t (*)())strtoq; base = 0; break; @@ -219,13 +223,13 @@ literal: /* FALLTHROUGH */ case 'o': c = CT_INT; - ccfn = strtoul; + ccfn = strtouq; base = 8; break; case 'u': c = CT_INT; - ccfn = strtoul; + ccfn = strtouq; base = 10; break; @@ -235,7 +239,7 @@ literal: case 'x': flags |= PFXOK; /* enable 0x prefixing */ c = CT_INT; - ccfn = strtoul; + ccfn = strtouq; base = 16; break; @@ -267,7 +271,7 @@ literal: case 'p': /* pointer format is like hex */ flags |= POINTER | PFXOK; c = CT_INT; - ccfn = strtoul; + ccfn = strtouq; base = 16; break; @@ -278,6 +282,8 @@ literal: *va_arg(ap, short *) = nread; else if (flags & LONG) *va_arg(ap, long *) = nread; + else if (flags & QUAD) + *va_arg(ap, quad_t *) = nread; else *va_arg(ap, int *) = nread; continue; @@ -292,7 +298,7 @@ literal: if (isupper(c)) flags |= LONG; c = CT_INT; - ccfn = (u_long (*)())strtol; + ccfn = (u_quad_t (*)())strtoq; base = 10; break; } @@ -434,7 +440,7 @@ literal: continue; case CT_INT: - /* scan an integer as if by strtol/strtoul */ + /* scan an integer as if by strtoq/strtouq */ #ifdef hardway if (width == 0 || width > sizeof(buf) - 1) width = sizeof(buf) - 1; @@ -552,7 +558,7 @@ literal: (void) ungetc(c, fp); } if ((flags & SUPPRESS) == 0) { - u_long res; + u_quad_t res; *p = 0; res = (*ccfn)(buf, (char **)NULL, base); @@ -562,6 +568,8 @@ literal: *va_arg(ap, short *) = res; else if (flags & LONG) *va_arg(ap, long *) = res; + else if (flags & QUAD) + *va_arg(ap, quad_t *) = res; else *va_arg(ap, int *) = res; nassigned++; @@ -651,7 +659,9 @@ literal: *p = 0; res = strtod(buf,(char **) NULL); - if (flags & LONG) + if (flags & LONGDBL) + *va_arg(ap, long double *) = res; + else if (flags & LONG) *va_arg(ap, double *) = res; else *va_arg(ap, float *) = res; diff --git a/stdlib.subproj/strtod.c b/stdlib.subproj/strtod.c index 0ed39d8..05c075e 100644 --- a/stdlib.subproj/strtod.c +++ b/stdlib.subproj/strtod.c @@ -386,7 +386,7 @@ extern double rnd_prod(double, double), rnd_quot(double, double); #ifdef __cplusplus extern "C" double strtod(const char *s00, char **se); extern "C" char *__dtoa(double d, int mode, int ndigits, - int *decpt, int *sign, char **rve); + int *decpt, int *sign, char **rve, char **resultp); #endif struct @@ -398,8 +398,6 @@ Bigint { typedef struct Bigint Bigint; - static Bigint *freelist[Kmax+1]; - static Bigint * Balloc #ifdef KR_headers @@ -411,18 +409,13 @@ Balloc int x; Bigint *rv; - if (rv = freelist[k]) { - freelist[k] = rv->next; - } - else { - x = 1 << k; - rv = (Bigint *)MALLOC(sizeof(Bigint) + (x-1)*sizeof(Long)); - rv->k = k; - rv->maxwds = x; - } + x = 1 << k; + rv = (Bigint *)malloc(sizeof(Bigint) + (x-1)*sizeof(Long)); + rv->k = k; + rv->maxwds = x; rv->sign = rv->wds = 0; return rv; - } +} static void Bfree @@ -432,11 +425,8 @@ Bfree (Bigint *v) #endif { - if (v) { - v->next = freelist[v->k]; - freelist[v->k] = v; - } - } + free(v); +} #define Bcopy(x,y) memcpy((char *)&x->sign, (char *)&y->sign, \ y->wds*sizeof(Long) + 2*sizeof(int)) @@ -1916,9 +1906,9 @@ quorem __dtoa #ifdef KR_headers (d, mode, ndigits, decpt, sign, rve) - double d; int mode, ndigits, *decpt, *sign; char 
**rve; + double d; int mode, ndigits, *decpt, *sign; char **rve, char **resultp; #else - (double d, int mode, int ndigits, int *decpt, int *sign, char **rve) + (double d, int mode, int ndigits, int *decpt, int *sign, char **rve, char **resultp) #endif { /* Arguments ndigits, decpt, sign are similar to those @@ -1966,15 +1956,6 @@ __dtoa Bigint *b, *b1, *delta, *mlo, *mhi, *S; double d2, ds, eps; char *s, *s0; - static Bigint *result; - static int result_k; - - if (result) { - result->k = result_k; - result->maxwds = 1 << result_k; - Bfree(result); - result = 0; - } if (word0(d) & Sign_bit) { /* set sign for everything, including 0's and NaNs */ @@ -2136,11 +2117,8 @@ __dtoa if (i <= 0) i = 1; } - j = sizeof(ULong); - for(result_k = 0; sizeof(Bigint) - sizeof(ULong) + j <= i; - j <<= 1) result_k++; - result = Balloc(result_k); - s = s0 = (char *)result; + *resultp = (char *) malloc(i + 1); + s = s0 = *resultp; if (ilim >= 0 && ilim <= Quick_max && try_quick) { diff --git a/string.subproj/memccpy.c b/string.subproj/memccpy.c index d925f12..657b4f9 100644 --- a/string.subproj/memccpy.c +++ b/string.subproj/memccpy.c @@ -67,9 +67,10 @@ memccpy(t, f, c, n) if (n) { register unsigned char *tp = t; register const unsigned char *fp = f; + register unsigned char uc = c; do { - if ((*tp++ = *fp++) == c) - return (t); + if ((*tp++ = *fp++) == uc) + return (tp); } while (--n != 0); } return (0); diff --git a/sys.subproj/gettimeofday.c b/sys.subproj/gettimeofday.c index d07239b..8aa14ac 100644 --- a/sys.subproj/gettimeofday.c +++ b/sys.subproj/gettimeofday.c @@ -36,21 +36,26 @@ int gettimeofday (struct timeval *tp, struct timezone *tzp) { static int validtz = 0; static struct timezone cached_tz = {0}; + struct timeval localtv; + + if (tzp && (tp == NULL) && (validtz == 0)) { + tp = &localtv; + } if (syscall (SYS_gettimeofday, tp, tzp) < 0) { return (-1); } - if (validtz == 0) { - struct tm *localtm = localtime ((time_t *)&tp->tv_sec); - cached_tz.tz_dsttime = localtm->tm_isdst; - cached_tz.tz_minuteswest = - (-localtm->tm_gmtoff / SECSPERMIN) + - (localtm->tm_isdst * MINSPERHOUR); - validtz = 1; - } if (tzp) { - tzp->tz_dsttime = cached_tz.tz_dsttime; - tzp->tz_minuteswest = cached_tz.tz_minuteswest; + if (validtz == 0) { + struct tm *localtm = localtime ((time_t *)&tp->tv_sec); + cached_tz.tz_dsttime = localtm->tm_isdst; + cached_tz.tz_minuteswest = + (-localtm->tm_gmtoff / SECSPERMIN) + + (localtm->tm_isdst * MINSPERHOUR); + validtz = 1; + } + tzp->tz_dsttime = cached_tz.tz_dsttime; + tzp->tz_minuteswest = cached_tz.tz_minuteswest; } return (0); } diff --git a/sys.subproj/i386.subproj/vfork.s b/sys.subproj/i386.subproj/vfork.s index 714a205..edd5f4f 100644 --- a/sys.subproj/i386.subproj/vfork.s +++ b/sys.subproj/i386.subproj/vfork.s @@ -24,6 +24,7 @@ */ #include "SYS.h" +#if 0 LEAF(_vfork, 0) CALL_EXTERN(__cthread_fork_prepare) #if defined(__DYNAMIC__) @@ -161,4 +162,24 @@ L2: CALL_EXTERN_AGAIN(__cthread_fork_parent) pop %eax ret +#else + +LEAF(_vfork, 0) + popl %ecx + movl $SYS_vfork,%eax; // code for vfork -> eax + UNIX_SYSCALL_TRAP; // do the system call + jnb L1 // jump if CF==0 + pushl %ecx + BRANCH_EXTERN(cerror) + +L1: + orl %edx,%edx // CF=OF=0, ZF set if zero result + jz L2 // parent, since r1 == 0 in parent, 1 in child + xorl %eax,%eax // zero eax + jmp *%ecx + +L2: + jmp *%ecx + +#endif diff --git a/sys.subproj/ppc.subproj/_longjmp.s b/sys.subproj/ppc.subproj/_longjmp.s index 4591e54..6bdeb02 100644 --- a/sys.subproj/ppc.subproj/_longjmp.s +++ b/sys.subproj/ppc.subproj/_longjmp.s @@ -34,11 
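The gettimeofday() fix above computes the cached timezone only when the caller actually passed a tzp, and substitutes a local timeval when tp is NULL so localtime() still gets valid seconds. A user-level sketch of the corrected control flow; it leans on the BSD tm_gmtoff field, and the minuteswest arithmetic mirrors the original:

#include <sys/time.h>
#include <time.h>

static int gettimeofday_with_tz(struct timeval *tp, struct timezone *tzp)
{
    static int validtz = 0;
    static struct timezone cached_tz;
    struct timeval localtv;

    if (tzp != NULL && tp == NULL)
        tp = &localtv;                    /* need somewhere to put tv_sec */

    if (gettimeofday(tp, NULL) < 0)
        return -1;

    if (tzp != NULL) {
        if (validtz == 0) {
            time_t secs = tp->tv_sec;
            struct tm *localtm = localtime(&secs);

            cached_tz.tz_dsttime = localtm->tm_isdst;
            cached_tz.tz_minuteswest = (int)(-localtm->tm_gmtoff / 60)
                                       + localtm->tm_isdst * 60;
            validtz = 1;                  /* compute once, reuse afterwards */
        }
        *tzp = cached_tz;
    }
    return 0;
}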
+34,159 @@ * 8 September 1998 Matt Watson (mwatson@apple.com) * Created. Derived from longjmp.s */ -#include "SYS.h" + #include #include "_setjmp.h" +#define VRSave 256 + +/* special flag bit definitions copied from /osfmk/ppc/thread_act.h */ + +#define floatUsedbit 1 +#define vectorUsedbit 2 + + +#if defined(__DYNAMIC__) + .data + .non_lazy_symbol_pointer + .align 2 +L_memmove$non_lazy_ptr: + .indirect_symbol _memmove + .long 0 + .non_lazy_symbol_pointer + .align 2 +L__cpu_has_altivec$non_lazy_ptr: + .indirect_symbol __cpu_has_altivec + .long 0 + .text +#endif + LEAF(__longjmp) + + ; need to restore FPRs or VRs? + + lwz r5,JMP_flags(r3) + lwz r6,JMP_addr_at_setjmp(r3) + rlwinm r7,r5,0,vectorUsedbit,vectorUsedbit + rlwinm r8,r5,0,floatUsedbit,floatUsedbit + cmpw cr1,r3,r6 ; jmp_buf still at same address? + cmpwi cr3,r7,0 ; set cr3 iff VRs in use (non-volatile CR) + cmpwi cr4,r8,0 ; set cr4 iff FPRs in use (non-volatile CR) + beq+ cr1,LRestoreVRs + + ; jmp_buf was moved since setjmp (or is uninitialized.) + ; We must move VRs and FPRs to be quadword aligned at present address. + + stw r3,JMP_addr_at_setjmp(r3) ; update, in case we longjmp to this again + mr r31,r4 ; save "val" arg across memmove + mr r30,r3 ; and jmp_buf ptr + addi r3,r3,JMP_vr_base_addr + addi r4,r6,JMP_vr_base_addr + rlwinm r3,r3,0,0,27 ; r3 <- QW aligned addr where they should be + rlwinm r4,r4,0,0,27 ; r4 <- QW aligned addr where they originally were + sub r7,r4,r6 ; r7 <- offset of VRs/FPRs within jmp_buf + add r4,r30,r7 ; r4 <- where they are now + li r5,(JMP_buf_end - JMP_vr_base_addr) +#if defined(__DYNAMIC__) + bcl 20,31,1f ; Get pic-base +1: mflr r12 + addis r12, r12, ha16(L_memmove$non_lazy_ptr - 1b) + lwz r12, lo16(L_memmove$non_lazy_ptr - 1b)(r12) + mtctr r12 ; Get address left by dyld + bctrl +#else + bl _memmove +#endif + mr r3,r30 + mr r4,r31 + + ; Restore VRs iff any + ; cr3 - bne if VRs + ; cr4 - bne if FPRs + +LRestoreVRs: + beq+ cr3,LZeroVRSave ; no VRs + lwz r0,JMP_vrsave(r3) + addi r6,r3,JMP_vr_base_addr + cmpwi r0,0 ; any live VRs? + mtspr VRSave,r0 + beq+ LRestoreFPRs + lvx v20,0,r6 + li r7,16*1 + lvx v21,r7,r6 + li r7,16*2 + lvx v22,r7,r6 + li r7,16*3 + lvx v23,r7,r6 + li r7,16*4 + lvx v24,r7,r6 + li r7,16*5 + lvx v25,r7,r6 + li r7,16*6 + lvx v26,r7,r6 + li r7,16*7 + lvx v27,r7,r6 + li r7,16*8 + lvx v28,r7,r6 + li r7,16*9 + lvx v29,r7,r6 + li r7,16*10 + lvx v30,r7,r6 + li r7,16*11 + lvx v31,r7,r6 + b LRestoreFPRs ; skip zeroing VRSave + + ; Zero VRSave iff Altivec is supported, but VRs were not in use + ; at setjmp time. This covers the case where VRs are first used after + ; the setjmp but before the longjmp, and where VRSave is nonzero at + ; the longjmp. We need to zero it now, or it will always remain + ; nonzero since they are sticky bits. 
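The __longjmp preamble above detects a jmp_buf that was copied since setjmp and memmove's the saved vector/float block to the quadword-aligned spot within the buffer's current address. The same computation expressed in C, using the offsets from _setjmp.h; the buffer pointers and layout handling are simplified to the one step being illustrated:

#include <stdint.h>
#include <string.h>

#define VR_BASE_OFFSET  0x84                      /* JMP_vr_base_addr */
#define SAVE_AREA_SIZE  (0x1d4 - 0x84)            /* JMP_buf_end - JMP_vr_base_addr */

/* If the jmp_buf was bcopy'd after setjmp, the 16-byte-aligned save area now
 * lands at a different offset, so move the saved registers (memmove, since
 * the regions may overlap) to where the restore code will look for them.
 * buf is the buffer's current address, addr_at_setjmp the address recorded
 * in the buffer when setjmp ran. */
static void realign_save_area(unsigned char *buf, unsigned char *addr_at_setjmp)
{
    if (buf == addr_at_setjmp)
        return;                                   /* still where setjmp put it */

    uintptr_t new_base = ((uintptr_t)buf + VR_BASE_OFFSET) & ~(uintptr_t)15;
    uintptr_t old_base = ((uintptr_t)addr_at_setjmp + VR_BASE_OFFSET) & ~(uintptr_t)15;
    size_t    old_off  = old_base - (uintptr_t)addr_at_setjmp;

    memmove((void *)new_base, buf + old_off, SAVE_AREA_SIZE);
}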
+ +LZeroVRSave: +#if defined(__DYNAMIC__) + bcl 20,31,1f +1: mflr r9 ; get our address + addis r6,r9,ha16(L__cpu_has_altivec$non_lazy_ptr - 1b) + lwz r7,lo16(L__cpu_has_altivec$non_lazy_ptr - 1b)(r6) + lwz r7,0(r7) ; load the flag +#else + lis r7, ha16(__cpu_has_altivec) + lwz r7, lo16(__cpu_has_altivec)(r7) +#endif + cmpwi r7,0 + li r8,0 + beq LRestoreFPRs ; no Altivec, so skip + mtspr VRSave,r8 + + ; Restore FPRs if any + ; cr4 - bne iff FPRs + +LRestoreFPRs: + beq cr4,LRestoreGPRs ; FPRs not in use at setjmp + addi r6,r3,JMP_fp_base_addr + rlwinm r6,r6,0,0,27 ; mask off low 4 bits to qw align + lfd f14,0*8(r6) + lfd f15,1*8(r6) + lfd f16,2*8(r6) + lfd f17,3*8(r6) + lfd f18,4*8(r6) + lfd f19,5*8(r6) + lfd f20,6*8(r6) + lfd f21,7*8(r6) + lfd f22,8*8(r6) + lfd f23,9*8(r6) + lfd f24,10*8(r6) + lfd f25,11*8(r6) + lfd f26,12*8(r6) + lfd f27,13*8(r6) + lfd f28,14*8(r6) + lfd f29,15*8(r6) + lfd f30,16*8(r6) + lfd f31,17*8(r6) + + ; Restore GPRs + +LRestoreGPRs: lwz r31, JMP_r31(r3) /* r1, r14-r30 */ lwz r1, JMP_r1 (r3) diff --git a/sys.subproj/ppc.subproj/_setjmp.h b/sys.subproj/ppc.subproj/_setjmp.h index e97255c..8a78817 100644 --- a/sys.subproj/ppc.subproj/_setjmp.h +++ b/sys.subproj/ppc.subproj/_setjmp.h @@ -28,6 +28,14 @@ * */ +/* NOTE: jmp_bufs are only 4-byte aligned. This means we + * need to pad before the VR and FPR save areas, so that they + * can be naturally aligned in the buffer. In case a jmp_buf + * is bcopy'd to a different alignment between the setjmp + * and longjmp, we need to save the jmp_buf address in the + * jmp_buf at setjmp time, so we can realign before reloading. + */ + #define JMP_r1 0x00 #define JMP_r2 0x04 #define JMP_r13 0x08 @@ -55,3 +63,13 @@ #define JMP_xer 0x60 #define JMP_sig 0x64 #define JMP_SIGFLAG 0x68 +#define JMP_flags 0x6c +#define JMP_vrsave 0x70 +#define JMP_addr_at_setjmp 0x74 +/* 12 bytes padding here */ +#define JMP_vr_base_addr 0x84 +/* save room for 12 VRs (v20-v31), or 0xC0 bytes */ +#define JMP_fp_base_addr 0x144 +/* save room for 18 FPRs (f14-f31), or 0x90 bytes */ +#define JMP_buf_end 0x1d4 + diff --git a/sys.subproj/ppc.subproj/_setjmp.s b/sys.subproj/ppc.subproj/_setjmp.s index 2be62c8..c69f9ad 100644 --- a/sys.subproj/ppc.subproj/_setjmp.s +++ b/sys.subproj/ppc.subproj/_setjmp.s @@ -33,10 +33,20 @@ * Created. Derived from setjmp.s */ -#include "SYS.h" + #include #include "_setjmp.h" +#define VRSave 256 + +/* special flag bit definitions copied from /osfmk/ppc/thread_act.h */ + +#define floatUsedbit 1 +#define vectorUsedbit 2 + +#define FlagsFastTrap 0x7FF3 + + LEAF(__setjmp) stw r31, JMP_r31(r3) /* r1, r2, r13-r30 */ @@ -68,6 +78,77 @@ LEAF(__setjmp) stw r5, JMP_lr(r3) stw r6, JMP_ctr(r3) stw r7, JMP_xer(r3) - li r3, 0 + + mr r31,r3 ; save jmp_buf ptr + li r0,FlagsFastTrap + sc ; get FPR-inuse and VR-inuse flags from kernel + rlwinm r4,r3,0,floatUsedbit,floatUsedbit + rlwinm. r5,r3,0,vectorUsedbit,vectorUsedbit + cmpwi cr1,r4,0 ; set CR1 bne iff FPRs in use + stw r3,JMP_flags(r31) + stw r31,JMP_addr_at_setjmp(r31) + mr r3,r31 ; restore jmp_buf ptr + lwz r31,JMP_r31(r31) + beq LSaveFPRsIfNecessary ; skip if vectorUsedbit was 0 + + ; must save VRs and VRSAVE + + mfspr r4,VRSave + andi. 
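The _setjmp.h comment above explains why the buffer needs padding: jmp_bufs are only 4-byte aligned, so the vector and float save areas sit at fixed offsets with enough slack to be pushed up to a 16-byte boundary. A compile-time check of that arithmetic (C11), using the offsets defined above:

#include <assert.h>

#define JMP_addr_at_setjmp 0x74
#define JMP_vr_base_addr   0x84
#define JMP_fp_base_addr   0x144
#define JMP_buf_end        0x1d4

/* 12 bytes of padding before the vector area, room for 12 quadword VRs
 * (v20-v31) and 18 doubleword FPRs (f14-f31). */
static_assert(JMP_vr_base_addr - (JMP_addr_at_setjmp + 4) == 12,
              "12 bytes of alignment padding");
static_assert(JMP_fp_base_addr - JMP_vr_base_addr == 12 * 16,
              "room for v20-v31");
static_assert(JMP_buf_end - JMP_fp_base_addr == 18 * 8,
              "room for f14-f31");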
r0,r4,0xFFF ; we only care about v20-v31 + stw r0,JMP_vrsave(r3) ; set up effective VRSAVE + beq LSaveFPRsIfNecessary ; no live non-volatile VRs + addi r6,r3,JMP_vr_base_addr + stvx v20,0,r6 + li r4,16*1 + stvx v21,r4,r6 + li r4,16*2 + stvx v22,r4,r6 + li r4,16*3 + stvx v23,r4,r6 + li r4,16*4 + stvx v24,r4,r6 + li r4,16*5 + stvx v25,r4,r6 + li r4,16*6 + stvx v26,r4,r6 + li r4,16*7 + stvx v27,r4,r6 + li r4,16*8 + stvx v28,r4,r6 + li r4,16*9 + stvx v29,r4,r6 + li r4,16*10 + stvx v30,r4,r6 + li r4,16*11 + stvx v31,r4,r6 + + ; must save FPRs if they are live in this thread + ; CR1 = bne iff FPRs are in use + +LSaveFPRsIfNecessary: + beq cr1,LExit ; FPRs not in use + addi r6,r3,JMP_fp_base_addr + rlwinm r6,r6,0,0,27 ; mask off low 4 bits to qw align + stfd f14,0*8(r6) + stfd f15,1*8(r6) + stfd f16,2*8(r6) + stfd f17,3*8(r6) + stfd f18,4*8(r6) + stfd f19,5*8(r6) + stfd f20,6*8(r6) + stfd f21,7*8(r6) + stfd f22,8*8(r6) + stfd f23,9*8(r6) + stfd f24,10*8(r6) + stfd f25,11*8(r6) + stfd f26,12*8(r6) + stfd f27,13*8(r6) + stfd f28,14*8(r6) + stfd f29,15*8(r6) + stfd f30,16*8(r6) + stfd f31,17*8(r6) + +LExit: + li r3, 0 blr diff --git a/sys.subproj/ppc.subproj/ur_cthread.s b/sys.subproj/ppc.subproj/ur_cthread.s index f3695ba..50ff2be 100644 --- a/sys.subproj/ppc.subproj/ur_cthread.s +++ b/sys.subproj/ppc.subproj/ur_cthread.s @@ -21,8 +21,8 @@ */ .text .align 2 - .globl __pthread_self -__pthread_self: + .globl _pthread_self +_pthread_self: li r0, 0x7FF2 sc blr diff --git a/sys.subproj/ppc.subproj/vfork.s b/sys.subproj/ppc.subproj/vfork.s index 6a3277a..14bc4f3 100644 --- a/sys.subproj/ppc.subproj/vfork.s +++ b/sys.subproj/ppc.subproj/vfork.s @@ -29,7 +29,7 @@ * */ -#if 1 +#if 0 #import #import #import diff --git a/threads.subproj/Makefile b/threads.subproj/Makefile index 66f86ea..a61e757 100644 --- a/threads.subproj/Makefile +++ b/threads.subproj/Makefile @@ -14,7 +14,7 @@ PROJECT_TYPE = Component HFILES = cthread_internals.h cthreads.h -CFILES = cprocs.c cthreads.c lu_utils.c mig_support.c threads_data.c +CFILES = cprocs.c cthreads.c lu_utils.c mig_support.c SUBPROJECTS = i386.subproj ppc.subproj diff --git a/threads.subproj/PB.project b/threads.subproj/PB.project index 0fbe5d1..e63fd07 100644 --- a/threads.subproj/PB.project +++ b/threads.subproj/PB.project @@ -2,7 +2,7 @@ DYNAMIC_CODE_GEN = YES; FILESTABLE = { H_FILES = (cthread_internals.h, cthreads.h); - OTHER_LINKED = (cprocs.c, cthreads.c, lu_utils.c, mig_support.c, threads_data.c); + OTHER_LINKED = (cprocs.c, cthreads.c, lu_utils.c, mig_support.c); OTHER_SOURCES = (Makefile.preamble, Makefile, Makefile.postamble); PROJECT_HEADERS = (cthread_internals.h, cthreads.h); SUBPROJECTS = (i386.subproj, ppc.subproj); diff --git a/threads.subproj/i386.subproj/thread.c b/threads.subproj/i386.subproj/thread.c index ee31e52..64595c3 100644 --- a/threads.subproj/i386.subproj/thread.c +++ b/threads.subproj/i386.subproj/thread.c @@ -48,7 +48,7 @@ _pthread_set_self(p) } void * -_pthread_self() +pthread_self() { asm("movl $0, %eax"); asm("lcall $0x3b, $0"); diff --git a/threads.subproj/threads_data.c b/threads.subproj/threads_data.c deleted file mode 100644 index 587b938..0000000 --- a/threads.subproj/threads_data.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). 
You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * This file contains global data and the size of the global data can NOT - * change or otherwise it would make the shared library incompatable. It - * is padded so that new data can take the place of storage occupied by part - * of it. - */ -int msg_send_timeout = 100; /* milliseconds */ -int msg_receive_timeout = 10; /* milliseconds */ -int mutex_spin_limit = 0; -int cthread_stack_mask = 0; -extern void cthread_init(); -unsigned int cproc_default_stack_size = 1000000; -int condition_spin_limit = 0; -int condition_yield_limit = 7; -unsigned int initial_stack_boundary = 0; -unsigned int cthread_stack_base = 0; /* Base for stack allocation */ -int malloc_lock = 0; /* - * Needs to be shared between malloc.o - * and malloc_utils.o - */ - -/* global data padding, must NOT be static */ -char _threads_data_padding[208] = { 0 }; diff --git a/util.subproj/pty.c b/util.subproj/pty.c index 8c9fc0e..aa0b2ad 100644 --- a/util.subproj/pty.c +++ b/util.subproj/pty.c @@ -82,7 +82,7 @@ int openpty(amaster, aslave, name, termp, winp) else ttygid = -1; - for (cp1 = "pqrs"; *cp1; cp1++) { + for (cp1 = "pqrstuvwxy"; *cp1; cp1++) { line[8] = *cp1; for (cp2 = "0123456789abcdef"; *cp2; cp2++) { line[5] = 'p'; -- 2.47.2
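The pty.c change at the end widens the master-device search from "pqrs" to "pqrstuvwxy", growing the pool of /dev/pty[p-y][0-9a-f] candidates from 64 to 160. A small program that simply enumerates the names openpty() would now probe:

#include <stdio.h>

int main(void)
{
    const char *letters = "pqrstuvwxy";
    const char *digits  = "0123456789abcdef";
    const char *cp1, *cp2;
    int count = 0;

    for (cp1 = letters; *cp1; cp1++) {
        for (cp2 = digits; *cp2; cp2++) {
            printf("/dev/pty%c%c\n", *cp1, *cp2);
            count++;
        }
    }
    printf("%d candidate master devices\n", count);
    return 0;
}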