git.saurik.com Git - apple/libc.git/commitdiff
snapshot: Libc-186.tar.gz    tags: mac-os-x-101 mac-os-x-1011 mac-os-x-1012 mac-os-x-1013 mac-os-x-1014 mac-os-x-1015 v186
author    Apple <opensource@apple.com>
          Wed, 29 Aug 2001 23:32:14 +0000 (23:32 +0000)
committer Apple <opensource@apple.com>
          Wed, 29 Aug 2001 23:32:14 +0000 (23:32 +0000)
33 files changed:
Makefile.postamble
gen.subproj/crypt.c
gen.subproj/popen.c
gen.subproj/ppc.subproj/Makefile
gen.subproj/ppc.subproj/PB.project
gen.subproj/ppc.subproj/bcopy.s [deleted file]
gen.subproj/ppc.subproj/blockmoof.s [new file with mode: 0755]
gen.subproj/ppc.subproj/memcpy.s [deleted file]
gen.subproj/ppc.subproj/memmove.s [deleted file]
gen.subproj/scalable_malloc.c
locale.subproj/rune.c
locale.subproj/setlocale.c
mach.subproj/mach_init.c
pthreads.subproj/pthread.c
pthreads.subproj/pthread_cond.c
pthreads.subproj/pthread_internals.h
pthreads.subproj/pthread_mutex.c
stdio.subproj/vfprintf.c
stdio.subproj/vfscanf.c
stdlib.subproj/strtod.c
string.subproj/memccpy.c
sys.subproj/gettimeofday.c
sys.subproj/i386.subproj/vfork.s
sys.subproj/ppc.subproj/_longjmp.s
sys.subproj/ppc.subproj/_setjmp.h
sys.subproj/ppc.subproj/_setjmp.s
sys.subproj/ppc.subproj/ur_cthread.s
sys.subproj/ppc.subproj/vfork.s
threads.subproj/Makefile
threads.subproj/PB.project
threads.subproj/i386.subproj/thread.c
threads.subproj/threads_data.c [deleted file]
util.subproj/pty.c

diff --git a/Makefile.postamble b/Makefile.postamble
index e702b54bedb72bc3c0c62fd825edb497ea45f61a..2083c132c52c254e8513e99767e0bc9d8d2673b2 100644 (file)
@@ -5,6 +5,7 @@ PRODUCTS += $(LIBRARY_PREFIX)$(NAME)$(DEBUG_SUFFIX)$(LIBRARY_EXT)
 PRODUCTS += $(LIBRARY_PREFIX)$(NAME)$(PROFILE_SUFFIX)$(LIBRARY_EXT)
 PRODUCTS += $(LIBRARY_PREFIX)$(NAME)$(STATIC_SUFFIX)$(LIBRARY_EXT)
 RECURSIVE_FLAGS += "LINK_SUBPROJECTS = NO"
+OTHER_CFLAGS += -no-cpp-precomp -force_cpusubtype_ALL
 
 static:
        $(SILENT) unset $(CUMULATIVE_VARIABLES) ||: ; \
diff --git a/gen.subproj/crypt.c b/gen.subproj/crypt.c
index 5e2caecce525d792e46ca4a8302a218717eb4ecd..2f569539c0cc029ddd70d66af9a46cc68420959c 100644 (file)
@@ -59,6 +59,7 @@
 #include <unistd.h>
 #include <limits.h>
 #include <pwd.h>
+#include <stdlib.h>
 
 /*
  * UNIX password, and DES, encryption.
@@ -465,19 +466,24 @@ static unsigned char itoa64[] =           /* 0..63 => ascii-64 */
 static unsigned char a64toi[128];      /* ascii-64 => 0..63 */
 
 /* Initial key schedule permutation */
-static C_block PC1ROT[64/CHUNKBITS][1<<CHUNKBITS];
+// static C_block      PC1ROT[64/CHUNKBITS][1<<CHUNKBITS];
+static C_block *PC1ROT;
 
 /* Subsequent key schedule rotation permutations */
-static C_block PC2ROT[2][64/CHUNKBITS][1<<CHUNKBITS];
+// static C_block      PC2ROT[2][64/CHUNKBITS][1<<CHUNKBITS];
+static C_block *PC2ROT[2];
 
 /* Initial permutation/expansion table */
-static C_block IE3264[32/CHUNKBITS][1<<CHUNKBITS];
+// static C_block      IE3264[32/CHUNKBITS][1<<CHUNKBITS];
+static C_block *IE3264;
 
 /* Table that combines the S, P, and E operations.  */
-static long SPE[2][8][64];
+// static long SPE[2][8][64];
+static long *SPE;
 
 /* compressed/interleaved => final permutation table */
-static C_block CF6464[64/CHUNKBITS][1<<CHUNKBITS];
+// static C_block      CF6464[64/CHUNKBITS][1<<CHUNKBITS];
+static C_block *CF6464;
 
 
 /* ==================================== */
@@ -606,13 +612,13 @@ STATIC int des_setkey(key)
                des_ready = 1;
        }
 
-       PERM6464(K,K0,K1,(unsigned char *)key,(C_block *)PC1ROT);
+       PERM6464(K,K0,K1,(unsigned char *)key,PC1ROT);
        key = (char *)&KS[0];
        STORE(K&~0x03030303L, K0&~0x03030303L, K1, *(C_block *)key);
        for (i = 1; i < 16; i++) {
                key += sizeof(C_block);
                STORE(K,K0,K1,*(C_block *)key);
-               ptabp = (C_block *)PC2ROT[Rotates[i]-1];
+               ptabp = PC2ROT[Rotates[i]-1];
                PERM6464(K,K0,K1,(unsigned char *)key,ptabp);
                STORE(K&~0x03030303L, K0&~0x03030303L, K1, *(C_block *)key);
        }
@@ -667,8 +673,8 @@ STATIC int des_cipher(in, out, salt, num_iter)
        R1 = (R1 >> 1) & 0x55555555L;
        L1 = R0 | R1;           /* L1 is the odd-numbered input bits */
        STORE(L,L0,L1,B);
-       PERM3264(L,L0,L1,B.b,  (C_block *)IE3264);      /* even bits */
-       PERM3264(R,R0,R1,B.b+4,(C_block *)IE3264);      /* odd bits */
+       PERM3264(L,L0,L1,B.b,IE3264);   /* even bits */
+       PERM3264(R,R0,R1,B.b+4,IE3264); /* odd bits */
 
        if (num_iter >= 0)
        {               /* encryption */
@@ -689,14 +695,14 @@ STATIC int des_cipher(in, out, salt, num_iter)
 #define        SPTAB(t, i)     (*(long *)((unsigned char *)t + i*(sizeof(long)/4)))
 #if defined(gould)
                        /* use this if B.b[i] is evaluated just once ... */
-#define        DOXOR(x,y,i)    x^=SPTAB(SPE[0][i],B.b[i]); y^=SPTAB(SPE[1][i],B.b[i]);
+#define        DOXOR(x,y,i)    x^=SPTAB(&SPE[i * 64],B.b[i]); y^=SPTAB(&SPE[(8 * 64) + (i * 64)],B.b[i]);
 #else
 #if defined(pdp11)
                        /* use this if your "long" int indexing is slow */
-#define        DOXOR(x,y,i)    j=B.b[i]; x^=SPTAB(SPE[0][i],j); y^=SPTAB(SPE[1][i],j);
+#define        DOXOR(x,y,i)    j=B.b[i]; x^=SPTAB(&SPE[i * 64],j); y^=SPTAB(&SPE[(8 * 64) + (i * 64)],j);
 #else
                        /* use this if "k" is allocated to a register ... */
-#define        DOXOR(x,y,i)    k=B.b[i]; x^=SPTAB(SPE[0][i],k); y^=SPTAB(SPE[1][i],k);
+#define        DOXOR(x,y,i)    k=B.b[i]; x^=SPTAB(&SPE[i * 64],k); y^=SPTAB(&SPE[(8 * 64) + (i * 64)],k);
 #endif
 #endif
 
@@ -731,7 +737,7 @@ STATIC int des_cipher(in, out, salt, num_iter)
        L0 = ((L0 >> 3) & 0x0f0f0f0fL) | ((L1 << 1) & 0xf0f0f0f0L);
        L1 = ((R0 >> 3) & 0x0f0f0f0fL) | ((R1 << 1) & 0xf0f0f0f0L);
        STORE(L,L0,L1,B);
-       PERM6464(L,L0,L1,B.b, (C_block *)CF6464);
+       PERM6464(L,L0,L1,B.b,CF6464);
 #if defined(MUST_ALIGN)
        STORE(L,L0,L1,B);
        out[0] = B.b[0]; out[1] = B.b[1]; out[2] = B.b[2]; out[3] = B.b[3];
@@ -781,6 +787,9 @@ STATIC void init_des()
 #ifdef DEBUG
        prtab("pc1tab", perm, 8);
 #endif
+       PC1ROT = (C_block *)calloc(sizeof(C_block), (64/CHUNKBITS) * (1<<CHUNKBITS));
+       for (i = 0; i < 2; i++)
+               PC2ROT[i] = (C_block *)calloc(sizeof(C_block), (64/CHUNKBITS) * (1<<CHUNKBITS));
        init_perm(PC1ROT, perm, 8, 8);
 
        /*
@@ -829,6 +838,7 @@ STATIC void init_des()
 #ifdef DEBUG
        prtab("ietab", perm, 8);
 #endif
+       IE3264 = (C_block *)calloc(sizeof(C_block), (32/CHUNKBITS) * (1<<CHUNKBITS));
        init_perm(IE3264, perm, 4, 8);
 
        /*
@@ -846,6 +856,8 @@ STATIC void init_des()
 #ifdef DEBUG
        prtab("cftab", perm, 8);
 #endif
+       CF6464 = (C_block *)calloc(sizeof(C_block), (64/CHUNKBITS) * (1<<CHUNKBITS));
+       SPE = (long *)calloc(sizeof(long), 2 * 8 * 64);
        init_perm(CF6464, perm, 8, 8);
 
        /*
@@ -873,11 +885,11 @@ STATIC void init_des()
                        k = 0;
                        for (i = 24; --i >= 0; )
                                k = (k<<1) | tmp32[perm[i]-1];
-                       TO_SIX_BIT(SPE[0][tableno][j], k);
+                       TO_SIX_BIT(SPE[(tableno * 64) + j], k);
                        k = 0;
                        for (i = 24; --i >= 0; )
                                k = (k<<1) | tmp32[perm[i+24]-1];
-                       TO_SIX_BIT(SPE[1][tableno][j], k);
+                       TO_SIX_BIT(SPE[(8 * 64) + (tableno * 64) + j], k);
                }
        }
 }
@@ -891,7 +903,7 @@ STATIC void init_des()
  * "perm" must be all-zeroes on entry to this routine.
  */
 STATIC void init_perm(perm, p, chars_in, chars_out)
-       C_block perm[64/CHUNKBITS][1<<CHUNKBITS];
+       C_block *perm;
        unsigned char p[64];
        int chars_in, chars_out;
 {
@@ -905,7 +917,7 @@ STATIC void init_perm(perm, p, chars_in, chars_out)
                l = 1<<(l&(CHUNKBITS-1));       /* mask for this bit */
                for (j = 0; j < (1<<CHUNKBITS); j++) {  /* each chunk value */
                        if ((j & l) != 0)
-                               perm[i][j].b[k>>3] |= 1<<(k&07);
+                               perm[(i * (1<<CHUNKBITS)) + j].b[k>>3] |= 1<<(k&07);
                }
        }
 }
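The crypt.c changes above replace five large static permutation tables with pointers that init_des() fills via calloc, so the tables live on the heap and the former two-dimensional indexing (PC1ROT[i][j], SPE[0][tableno][j]) is flattened by hand into row-major offsets. A minimal sketch of the pattern, assuming an illustrative CHUNKBITS of 4 and a simplified C_block (the names mirror the diff; alloc_tables and pc1rot_at are hypothetical helpers, not code from the commit):

    #include <stdlib.h>

    #define CHUNKBITS 4                        /* illustrative; crypt.c computes its own */
    typedef struct { unsigned char b[8]; } C_block;

    /* was: static C_block PC1ROT[64/CHUNKBITS][1<<CHUNKBITS]; */
    static C_block *PC1ROT;

    static void alloc_tables(void)
    {
            /* one contiguous allocation stands in for the old 2D array */
            PC1ROT = (C_block *)calloc((64 / CHUNKBITS) * (1 << CHUNKBITS),
                sizeof(C_block));
    }

    static C_block *pc1rot_at(int i, int j)
    {
            /* old PC1ROT[i][j], flattened row-major as in the diff */
            return &PC1ROT[(i * (1 << CHUNKBITS)) + j];
    }

Note that the diff passes sizeof(C_block) as calloc's first argument and the element count as the second, the reverse of the conventional calloc(nmemb, size) order; the total allocation is the same either way.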
diff --git a/gen.subproj/popen.c b/gen.subproj/popen.c
index 7729280278f3a0320eb23839d9636f02df3352d9..885d6c6bf13ba8af26e8b9ea7514c6caf7e1b0ca 100644 (file)
@@ -2,13 +2,13 @@
  * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
- * 
+ *
  * The contents of this file constitute Original Code as defined in and
  * are subject to the Apple Public Source License Version 1.1 (the
  * "License").  You may not use this file except in compliance with the
  * License.  Please obtain a copy of the License at
  * http://www.apple.com/publicsource and read it before using this file.
- * 
+ *
  * This Original Code and all software distributed under the License are
  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -16,7 +16,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  * License for the specific language governing rights and limitations
  * under the License.
- * 
+ *
  * @APPLE_LICENSE_HEADER_END@
  */
 /*
@@ -55,7 +55,6 @@
  * SUCH DAMAGE.
  */
 
-
 #include <sys/param.h>
 #include <sys/wait.h>
 #include <sys/socket.h>
@@ -67,6 +66,9 @@
 #include <stdlib.h>
 #include <string.h>
 #include <paths.h>
+#include <crt_externs.h>
+
+#define environ *(_NSGetEnviron())
 
 static struct pid {
        struct pid *next;
@@ -81,38 +83,57 @@ popen(command, type)
        struct pid *cur;
        FILE *iop;
        int pdes[2], pid, twoway;
+       char *argv[4];
+       struct pid *p;
 
        if (strchr(type, '+')) {
                twoway = 1;
                type = "r+";
-               if (socketpair(AF_UNIX, SOCK_STREAM, 0, pdes) < 0)
-                       return (NULL);
+                if (socketpair(AF_UNIX, SOCK_STREAM, 0, pdes) < 0)
+                        return (NULL);
        } else  {
                twoway = 0;
-               if (*type != 'r' && *type != 'w' || type[1] ||
-                   (pipe(pdes) < 0))
+               if ((*type != 'r' && *type != 'w') || type[1])
                        return (NULL);
        }
+       if (pipe(pdes) < 0)
+               return (NULL);
 
-       if ((cur = malloc(sizeof(struct pid))) == NULL)
+       if ((cur = malloc(sizeof(struct pid))) == NULL) {
+               (void)close(pdes[0]);
+               (void)close(pdes[1]);
                return (NULL);
+       }
+
+       argv[0] = "sh";
+       argv[1] = "-c";
+       argv[2] = (char *)command;
+       argv[3] = NULL;
 
        switch (pid = vfork()) {
        case -1:                        /* Error. */
                (void)close(pdes[0]);
                (void)close(pdes[1]);
-               (void)free(cur);
+               free(cur);
                return (NULL);
                /* NOTREACHED */
        case 0:                         /* Child. */
                if (*type == 'r') {
+                       /*
+                        * The _dup2() to STDIN_FILENO is repeated to avoid
+                        * writing to pdes[1], which might corrupt the
+                        * parent's copy.  This isn't good enough in
+                        * general, since the _exit() is no return, so
+                        * the compiler is free to corrupt all the local
+                        * variables.
+                        */
+                       (void)close(pdes[0]);
                        if (pdes[1] != STDOUT_FILENO) {
                                (void)dup2(pdes[1], STDOUT_FILENO);
                                (void)close(pdes[1]);
-                               pdes[1] = STDOUT_FILENO;
-                       }
-                       (void) close(pdes[0]);
-                       if (twoway && (pdes[1] != STDIN_FILENO))
+                               if (twoway)
+                                       (void)dup2(STDOUT_FILENO, STDIN_FILENO);
+                       } else if (twoway && (pdes[1] != STDIN_FILENO))
                                (void)dup2(pdes[1], STDIN_FILENO);
                } else {
                        if (pdes[0] != STDIN_FILENO) {
@@ -120,8 +141,11 @@ popen(command, type)
                                (void)close(pdes[0]);
                        }
                        (void)close(pdes[1]);
+                       }
+               for (p = pidlist; p; p = p->next) {
+                       (void)close(fileno(p->fp));
                }
-               execl(_PATH_BSHELL, "sh", "-c", command, NULL);
+               execve(_PATH_BSHELL, argv, environ);
                _exit(127);
                /* NOTREACHED */
        }
@@ -154,7 +178,6 @@ pclose(iop)
        FILE *iop;
 {
        register struct pid *cur, *last;
-       int omask;
        int pstat;
        pid_t pid;
 
@@ -168,7 +191,7 @@ pclose(iop)
        (void)fclose(iop);
 
        do {
-               pid = waitpid(cur->pid, &pstat, 0);
+               pid = wait4(cur->pid, &pstat, 0, (struct rusage *)0);
        } while (pid == -1 && errno == EINTR);
 
        /* Remove the entry from the linked list. */
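Two details of the popen() rewrite are worth noting: the argv vector is now built before vfork() so the child only has to close descriptors and execve(), and the environment is obtained through _NSGetEnviron() from <crt_externs.h>, since code in a Mach-O dynamic library on Darwin cannot reference the crt0 environ symbol directly (hence the #define environ *(_NSGetEnviron()) above). A minimal sketch of the exec half under those assumptions (run_shell_command is a hypothetical name, not part of the commit):

    #include <crt_externs.h>            /* Darwin: _NSGetEnviron() */
    #include <paths.h>                  /* _PATH_BSHELL */
    #include <unistd.h>

    static void run_shell_command(const char *command)
    {
            char *argv[4];

            argv[0] = "sh";
            argv[1] = "-c";
            argv[2] = (char *)command;
            argv[3] = NULL;

            /* *_NSGetEnviron() yields the live char ** that the
               #define environ in the diff expands to */
            execve(_PATH_BSHELL, argv, *_NSGetEnviron());
            _exit(127);                 /* reached only if execve() fails */
    }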
diff --git a/gen.subproj/ppc.subproj/Makefile b/gen.subproj/ppc.subproj/Makefile
index 2a0ec7038093016f216d9216da7b8434af63a3f1..0c95a7bb937ef337925d1fe0bd8ad2f8895bbcc0 100644 (file)
@@ -14,16 +14,16 @@ PROJECT_TYPE = Component
 
 HFILES = fp.h genassym.h
 
-OTHERLINKED = abs.s bcopy.s bzero.s ffs.s mcount.s memcpy.s\
-              memmove.s strlen.s
+OTHERLINKED = abs.s blockmoof.s bzero.s ffs.s mcount.s \
+              strlen.s
 
 CFILES = bcmp.c ecvt.c insque.c isinf.c remque.c setjmperr.c\
          strcat.c strcpy.c strncat.c strncmp.c strncpy.c
 
 OTHERSRCS = Makefile.preamble Makefile Makefile.postamble
 
-OTHERLINKEDOFILES = abs.o bcopy.o bzero.o ffs.o mcount.o memcpy.o\
-                    memmove.o strlen.o
+OTHERLINKEDOFILES = abs.o blockmoof.o bzero.o ffs.o mcount.o \
+                    strlen.o
 
 MAKEFILEDIR = $(MAKEFILEPATH)/pb_makefiles
 CODE_GEN_STYLE = DYNAMIC
diff --git a/gen.subproj/ppc.subproj/PB.project b/gen.subproj/ppc.subproj/PB.project
index d1d80138a98cd3abd9fdc7595b7618dfb4bdcb78..6fec101167334e9783fa925b15ef756a411c6dd4 100644 (file)
@@ -5,15 +5,13 @@
         OTHER_LINKED = (
             abs.s, 
             bcmp.c, 
-            bcopy.s, 
+            blockmoof.s, 
             bzero.s, 
             ecvt.c, 
             ffs.s, 
             insque.c, 
             isinf.c, 
             mcount.s, 
-            memcpy.s, 
-            memmove.s, 
             remque.c, 
             setjmperr.c, 
             strcat.c, 
diff --git a/gen.subproj/ppc.subproj/bcopy.s b/gen.subproj/ppc.subproj/bcopy.s
deleted file mode 100644 (file)
index 38ffd42..0000000
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- * 
- * @APPLE_LICENSE_HEADER_END@
- */
-;
-;                      Copy bytes of data around. handles overlapped data.
-;
-;                      Change this to use Altivec later on
-;
-
-;      
-; void bcopy(from, to, nbytes)
-;
-
-;                      Use CR5_lt to indicate non-cached
-#define noncache       20
-.text
-.align 2
-#if !defined(MEMCPY) && !defined(MEMMOVE)
-.globl _bcopy
-_bcopy:
-                       crclr           noncache                                        ; Set cached
-                       cmplw           cr1,r4,r3                                       ; Compare "to" and "from"
-                       mr.             r5,r5                                           ; Check if we have a 0 length
-                       mr              r6,r3                                           ; Set source
-                       beqlr-          cr1                                             ; Bail if "to" and "from" are the same  
-                       beqlr-                                                          ; Bail if length is 0
-                       b               Lcopyit                                         ; Go copy it...
-
-;
-;                      When we move the memory, forward overlays must be handled.  We
-;                      also can not use the cache instructions if we are from bcopy_nc.
-;                      We need to preserve R3 because it needs to be returned for memcpy.
-;                      We can be interrupted and lose control here.
-;
-;                      There is no stack, so in order to used floating point, we would
-;                      need to take the FP exception. Any potential gains by using FP 
-;                      would be more than eaten up by this.
-;
-;                      Later, we should used Altivec for large moves.
-;
-
-#else
-#if defined(MEMCPY)
-.globl _memcpy
-_memcpy:
-#endif
-
-#if defined(MEMMOVE)
-.globl _memmove
-_memmove:
-#endif
-                       cmplw           cr1,r3,r4                                       ; "to" and "from" the same?
-                       mr              r6,r4                                           ; Set the "from"
-                       mr.             r5,r5                                           ; Length zero?
-                        crclr          noncache                                        ; Set cached
-                       mr              r4,r3                                           ; Set the "to"
-                       beqlr-          cr1                                             ; "to" and "from" are the same
-                       beqlr-                                                          ; Length is 0
-#endif
-Lcopyit:               sub             r12,r4,r6                                       ; Get potential overlap (negative if backward move)
-                       lis             r8,0x7FFF                                       ; Start up a mask
-                       srawi           r11,r12,31                                      ; Propagate the sign bit
-                       dcbt            0,r6                                            ; Touch in the first source line
-                       cntlzw          r7,r5                                           ; Get the highest power of 2 factor of the length
-                       ori             r8,r8,0xFFFF                                    ; Make limit 0x7FFFFFFF
-                       xor             r9,r12,r11                                      ; If sink - source was negative, invert bits
-                       srw             r8,r8,r7                                        ; Get move length limitation
-                       sub             r9,r9,r11                                       ; If sink - source was negative, add 1 and get absolute value
-                       cmplw           r12,r5                                          ; See if we actually forward overlap
-                       cmplwi          cr7,r9,32                                       ; See if at least a line between  source and sink
-                       dcbtst          0,r4                                            ; Touch in the first sink line
-                       cmplwi          cr1,r5,32                                       ; Are we moving more than a line?
-                        cror           noncache,noncache,28                            ; Set to not DCBZ output line if not enough space
-                       blt-            Lfwdovrlap                                      ; This is a forward overlapping area, handle it...
-
-;
-;                      R4 = sink
-;                      R5 = length
-;                      R6 = source
-;
-                       
-;
-;                      Here we figure out how much we have to move to get the sink onto a
-;                      cache boundary.  If we can, and there are still more that 32 bytes
-;                      left to move, we can really speed things up by DCBZing the sink line.
-;                      We can not do this if noncache is set because we will take an 
-;                      alignment exception.
-
-                       neg             r0,r4                                           ; Get the number of bytes to move to align to a line boundary
-                       rlwinm.         r0,r0,0,27,31                                   ; Clean it up and test it
-                       and             r0,r0,r8                                        ; limit to the maximum front end move
-                       mtcrf           3,r0                                            ; Make branch mask for partial moves
-                       sub             r5,r5,r0                                        ; Set the length left to move
-                       beq             Lalline                                         ; Already on a line...
-                       
-                       bf              31,Lalhalf                                      ; No single byte to do...
-                       lbz             r7,0(r6)                                        ; Get the byte
-                       addi            r6,r6,1                                         ; Point to the next
-                       stb             r7,0(r4)                                        ; Save the single
-                       addi            r4,r4,1                                         ; Bump sink
-                       
-;                      Sink is halfword aligned here
-
-Lalhalf:               bf              30,Lalword                                      ; No halfword to do...
-                       lhz             r7,0(r6)                                        ; Get the halfword
-                       addi            r6,r6,2                                         ; Point to the next
-                       sth             r7,0(r4)                                        ; Save the halfword
-                       addi            r4,r4,2                                         ; Bump sink
-                       
-;                      Sink is word aligned here
-
-Lalword:               bf              29,Laldouble                                    ; No word to do...
-                       lwz             r7,0(r6)                                        ; Get the word
-                       addi            r6,r6,4                                         ; Point to the next
-                       stw             r7,0(r4)                                        ; Save the word
-                       addi            r4,r4,4                                         ; Bump sink
-                       
-;                      Sink is double aligned here
-
-Laldouble:             bf              28,Lalquad                                      ; No double to do...
-                       lwz             r7,0(r6)                                        ; Get the first word
-                       lwz             r8,4(r6)                                        ; Get the second word
-                       addi            r6,r6,8                                         ; Point to the next
-                       stw             r7,0(r4)                                        ; Save the first word
-                       stw             r8,4(r4)                                        ; Save the second word
-                       addi            r4,r4,8                                         ; Bump sink
-                       
-;                      Sink is quadword aligned here
-
-Lalquad:                       bf              27,Lalline                                      ; No quad to do...
-                       lwz             r7,0(r6)                                        ; Get the first word
-                       lwz             r8,4(r6)                                        ; Get the second word
-                       lwz             r9,8(r6)                                        ; Get the third word
-                       stw             r7,0(r4)                                        ; Save the first word
-                       lwz             r11,12(r6)                                      ; Get the fourth word
-                       addi            r6,r6,16                                        ; Point to the next
-                       stw             r8,4(r4)                                        ; Save the second word
-                       stw             r9,8(r4)                                        ; Save the third word
-                       stw             r11,12(r4)                                      ; Save the fourth word
-                       addi            r4,r4,16                                        ; Bump sink
-                       
-;                      Sink is line aligned here
-
-Lalline:                       rlwinm.         r0,r5,27,5,31                                   ; Get the number of full lines to move
-                       mtcrf           3,r5                                            ; Make branch mask for backend partial moves
-                       rlwinm          r11,r5,0,0,26                                   ; Get number of bytes to move
-                       beq-            Lbackend                                        ; No full lines to move
-                       
-                       sub             r5,r5,r11                                       ; Calculate the residual
-                        li              r10,96                                          ; Stride for touch ahead
-
-Lnxtline:              subic.          r0,r0,1                                         ; Account for the line now
-
-                       bt-             noncache,Lskipz                                 ; Skip if we are not cached...
-                       dcbz            0,r4                                            ; Blow away the whole line because we are replacing it
-                        dcbt           r6,r10                                          ; Touch ahead a bit
-
-Lskipz:                        lwz             r7,0(r6)                                        ; Get the first word
-                       lwz             r8,4(r6)                                        ; Get the second word
-                       lwz             r9,8(r6)                                        ; Get the third word
-                       stw             r7,0(r4)                                        ; Save the first word
-                       lwz             r11,12(r6)                                      ; Get the fourth word
-                       stw             r8,4(r4)                                        ; Save the second word
-                       lwz             r7,16(r6)                                       ; Get the fifth word
-                       stw             r9,8(r4)                                        ; Save the third word
-                       lwz             r8,20(r6)                                       ; Get the sixth word
-                       stw             r11,12(r4)                                      ; Save the fourth word
-                       lwz             r9,24(r6)                                       ; Get the seventh word
-                       stw             r7,16(r4)                                       ; Save the fifth word
-                       lwz             r11,28(r6)                                      ; Get the eighth word
-                       addi            r6,r6,32                                        ; Point to the next
-                       stw             r8,20(r4)                                       ; Save the sixth word
-                       stw             r9,24(r4)                                       ; Save the seventh word
-                       stw             r11,28(r4)                                      ; Save the eighth word
-                       addi            r4,r4,32                                        ; Bump sink
-                       bgt+            Lnxtline                                        ; Do the next line, if any...
-
-       
-;                      Move backend quadword
-
-Lbackend:              bf              27,Lnoquad                                      ; No quad to do...
-                       lwz             r7,0(r6)                                        ; Get the first word
-                       lwz             r8,4(r6)                                        ; Get the second word
-                       lwz             r9,8(r6)                                        ; Get the third word
-                       lwz             r11,12(r6)                                      ; Get the fourth word
-                       stw             r7,0(r4)                                        ; Save the first word
-                       addi            r6,r6,16                                        ; Point to the next
-                       stw             r8,4(r4)                                        ; Save the second word
-                       stw             r9,8(r4)                                        ; Save the third word
-                       stw             r11,12(r4)                                      ; Save the fourth word
-                       addi            r4,r4,16                                        ; Bump sink
-                       
-;                      Move backend double
-
-Lnoquad:               bf              28,Lnodouble                                    ; No double to do...
-                       lwz             r7,0(r6)                                        ; Get the first word
-                       lwz             r8,4(r6)                                        ; Get the second word
-                       addi            r6,r6,8                                         ; Point to the next
-                       stw             r7,0(r4)                                        ; Save the first word
-                       stw             r8,4(r4)                                        ; Save the second word
-                       addi            r4,r4,8                                         ; Bump sink
-                       
-;                      Move backend word
-
-Lnodouble:             bf              29,Lnoword                                      ; No word to do...
-                       lwz             r7,0(r6)                                        ; Get the word
-                       addi            r6,r6,4                                         ; Point to the next
-                       stw             r7,0(r4)                                        ; Save the word
-                       addi            r4,r4,4                                         ; Bump sink
-                       
-;                      Move backend halfword
-
-Lnoword:                       bf              30,Lnohalf                                      ; No halfword to do...
-                       lhz             r7,0(r6)                                        ; Get the halfword
-                       addi            r6,r6,2                                         ; Point to the next
-                       sth             r7,0(r4)                                        ; Save the halfword
-                       addi            r4,r4,2                                         ; Bump sink
-
-;                      Move backend byte
-
-Lnohalf:                       bflr            31                                              ; Leave cuz we are all done...  
-                       lbz             r7,0(r6)                                        ; Get the byte
-                       stb             r7,0(r4)                                        ; Save the single
-                       
-                       blr                                                             ; Leave cuz we are all done...                  
-
-;
-;                      0123456789ABCDEF0123456789ABCDEF
-;                       0123456789ABCDEF0123456789ABCDEF
-;                                                                                  F
-;                                                                                DE
-;                                                                        9ABC
-;                                                        12345678
-;             123456789ABCDEF0 
-;            0
-
-;
-;                      Here is where we handle a forward overlapping move.  These will be slow
-;                      because we can not kill the cache of the destination until after we have
-;                      loaded/saved the source area.  Also, because reading memory backwards is
-;                      slower when the cache line needs to be loaded because the critical 
-;                      doubleword is loaded first, i.e., the last, then it goes back to the first,
-;                      and on in order.  That means that when we are at the second to last DW we
-;                      have to wait until the whole line is in cache before we can proceed.
-;
-       
-Lfwdovrlap:            add             r4,r5,r4                                        ; Point past the last sink byte
-                       add             r6,r5,r6                                        ; Point past the last source byte 
-                       and             r0,r4,r8                                        ; Apply movement limit
-                       li              r12,-1                                          ; Make sure we touch in the actual line                         
-                       mtcrf           3,r0                                            ; Figure out the best way to move backwards                     
-                       dcbt            r12,r6                                          ; Touch in the last line of source
-                       rlwinm.         r0,r0,0,27,31                                   ; Calculate the length to adjust to cache boundary
-                       dcbtst          r12,r4                                          ; Touch in the last line of the sink
-                       beq-            Lballine                                                ; Aready on cache line boundary
-                       
-                       sub             r5,r5,r0                                        ; Precaculate move length left after alignment
-                       
-                       bf              31,Lbalhalf                                     ; No single byte to do...
-                       lbz             r7,-1(r6)                                       ; Get the byte
-                       subi            r6,r6,1                                         ; Point to the next
-                       stb             r7,-1(r4)                                       ; Save the single
-                       subi            r4,r4,1                                         ; Bump sink
-                       
-;                      Sink is halfword aligned here
-
-Lbalhalf:              bf              30,Lbalword                                     ; No halfword to do...
-                       lhz             r7,-2(r6)                                       ; Get the halfword
-                       subi            r6,r6,2                                         ; Point to the next
-                       sth             r7,-2(r4)                                       ; Save the halfword
-                       subi            r4,r4,2                                         ; Bump sink
-                       
-;                      Sink is word aligned here
-
-Lbalword:              bf              29,Lbaldouble                                   ; No word to do...
-                       lwz             r7,-4(r6)                                       ; Get the word
-                       subi            r6,r6,4                                         ; Point to the next
-                       stw             r7,-4(r4)                                       ; Save the word
-                       subi            r4,r4,4                                         ; Bump sink
-                       
-;                      Sink is double aligned here
-
-Lbaldouble:            bf              28,Lbalquad                                     ; No double to do...
-                       lwz             r7,-8(r6)                                       ; Get the first word
-                       lwz             r8,-4(r6)                                       ; Get the second word
-                       subi            r6,r6,8                                         ; Point to the next
-                       stw             r7,-8(r4)                                       ; Save the first word
-                       stw             r8,-4(r4)                                       ; Save the second word
-                       subi            r4,r4,8                                         ; Bump sink
-                       
-;                      Sink is quadword aligned here
-
-Lbalquad:              bf              27,Lballine                                     ; No quad to do...
-                       lwz             r7,-16(r6)                                      ; Get the first word
-                       lwz             r8,-12(r6)                                      ; Get the second word
-                       lwz             r9,-8(r6)                                       ; Get the third word
-                       lwz             r11,-4(r6)                                      ; Get the fourth word
-                       stw             r7,-16(r4)                                      ; Save the first word
-                       subi            r6,r6,16                                        ; Point to the next
-                       stw             r8,-12(r4)                                      ; Save the second word
-                       stw             r9,-8(r4)                                       ; Save the third word
-                       stw             r11,-4(r4)                                      ; Save the fourth word
-                       subi            r4,r4,16                                        ; Bump sink
-                       
-;                      Sink is line aligned here
-
-Lballine:              rlwinm.         r0,r5,27,5,31                                   ; Get the number of full lines to move
-                       mtcrf           3,r5                                            ; Make branch mask for backend partial moves
-                       beq-            Lbbackend                                       ; No full lines to move
-
-
-;                      Registers in use:       R0, R1,     R3, R4, R5, R6
-;                      Registers not in use:           R2,                 R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them
-                       
-Lbnxtline:             subic.          r0,r0,1                                         ; Account for the line now
-
-                       lwz             r7,-32(r6)                                      ; Get the first word
-                       lwz             r5,-28(r6)                                      ; Get the second word
-                       lwz             r2,-24(r6)                                      ; Get the third word
-                       lwz             r12,-20(r6)                                     ; Get the third word
-                       lwz             r11,-16(r6)                                     ; Get the fifth word
-                       lwz             r10,-12(r6)                                     ; Get the sixth word
-                       lwz             r9,-8(r6)                                       ; Get the seventh word
-                       lwz             r8,-4(r6)                                       ; Get the eighth word
-                       subi            r6,r6,32                                        ; Point to the next
-                       
-                       stw             r7,-32(r4)                                      ; Get the first word
-                       ble-            Lbnotouch                                       ; Last time, skip touch of source...
-                       dcbt            0,r6                                            ; Touch in next source line
-                       
-Lbnotouch:             stw             r5,-28(r4)                                      ; Get the second word
-                       stw             r2,-24(r4)                                      ; Get the third word
-                       stw             r12,-20(r4)                                     ; Get the third word
-                       stw             r11,-16(r4)                                     ; Get the fifth word
-                       stw             r10,-12(r4)                                     ; Get the sixth word
-                       stw             r9,-8(r4)                                       ; Get the seventh word
-                       stw             r8,-4(r4)                                       ; Get the eighth word
-                       subi            r4,r4,32                                        ; Bump sink
-                       
-                       bgt+            Lbnxtline                                       ; Do the next line, if any...
-
-;
-;                      Note: We touched these lines in at the beginning
-;
-       
-;                      Move backend quadword
-
-Lbbackend:             bf              27,Lbnoquad                                     ; No quad to do...
-                       lwz             r7,-16(r6)                                      ; Get the first word
-                       lwz             r8,-12(r6)                                      ; Get the second word
-                       lwz             r9,-8(r6)                                       ; Get the third word
-                       lwz             r11,-4(r6)                                      ; Get the fourth word
-                       stw             r7,-16(r4)                                      ; Save the first word
-                       subi            r6,r6,16                                        ; Point to the next
-                       stw             r8,-12(r4)                                      ; Save the second word
-                       stw             r9,-8(r4)                                       ; Save the third word
-                       stw             r11,-4(r4)                                      ; Save the fourth word
-                       subi            r4,r4,16                                        ; Bump sink
-                       
-;                      Move backend double
-
-Lbnoquad:              bf              28,Lbnodouble                                   ; No double to do...
-                       lwz             r7,-8(r6)                                       ; Get the first word
-                       lwz             r8,-4(r6)                                       ; Get the second word
-                       subi            r6,r6,8                                         ; Point to the next
-                       stw             r7,-8(r4)                                       ; Save the first word
-                       stw             r8,-4(r4)                                       ; Save the second word
-                       subi            r4,r4,8                                         ; Bump sink
-                       
-;                      Move backend word
-
-Lbnodouble:            bf              29,Lbnoword                                     ; No word to do...
-                       lwz             r7,-4(r6)                                       ; Get the word
-                       subi            r6,r6,4                                         ; Point to the next
-                       stw             r7,-4(r4)                                       ; Save the word
-                       subi            r4,r4,4                                         ; Bump sink
-                       
-;                      Move backend halfword
-
-Lbnoword:              bf              30,Lbnohalf                                     ; No halfword to do...
-                       lhz             r7,-2(r6)                                       ; Get the halfword
-                       subi            r6,r6,2                                         ; Point to the next
-                       sth             r7,-2(r4)                                       ; Save the halfword
-                       subi            r4,r4,2                                         ; Bump sink
-
-;                      Move backend byte
-
-Lbnohalf:              bflr            31                                              ; Leave cuz we are all done...  
-                       lbz             r7,-1(r6)                                       ; Get the byte
-                       stb             r7,-1(r4)                                       ; Save the single
-                       
-                       blr                                                             ; Leave cuz we are all done...                  
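For reference, the heart of the deleted bcopy.s is its overlap test: it computes sink minus source (sub r12,r4,r6) and compares that difference, interpreted as unsigned, against the length, taking the slower backward Lfwdovrlap path only when a left-to-right copy would clobber unread source bytes. The same decision in C, as a rough sketch (overlap_safe_copy is a hypothetical name; the unsigned wraparound lets one comparison cover both directions):

    #include <stddef.h>

    static void
    overlap_safe_copy(unsigned char *dst, const unsigned char *src, size_t len)
    {
            size_t i;

            /* if dst lies in [src, src+len), forward copying would
               overwrite source bytes before they are read; when dst is
               below src the subtraction wraps to a huge unsigned value,
               so the always-safe forward path is chosen */
            if ((size_t)(dst - src) < len) {
                    for (i = len; i > 0; i--)       /* copy right-to-left */
                            dst[i - 1] = src[i - 1];
            } else {
                    for (i = 0; i < len; i++)       /* copy left-to-right */
                            dst[i] = src[i];
            }
    }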
diff --git a/gen.subproj/ppc.subproj/blockmoof.s b/gen.subproj/ppc.subproj/blockmoof.s
new file mode 100755 (executable)
index 0000000..947e7f0
--- /dev/null
@@ -0,0 +1,940 @@
+/*
+ * Copyright (c) 1992-2001 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ * 
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ * 
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ * 
+ * @APPLE_LICENSE_HEADER_END@
+ */
+#include <architecture/ppc/asm_help.h>
+
+// =================================================================================================
+// *** The easiest way to assemble things on Mac OS X is via "cc", so this uses #defines and such.
+// =================================================================================================
+
+// Keep track of whether we have Altivec 
+// This gets set in pthread_init()
+
+.data
+.align 2
+.globl __cpu_has_altivec
+__cpu_has_altivec:
+.long 0
+
+.text
+.align 2
+.globl _bcopy
+.globl _memcpy
+.globl _memmove
+
+_bcopy:
+       mr      r2,r4   // Since bcopy uses (src,dest,count), swap r3,r4
+       mr      r4,r3
+       mr      r3,r2   
+_memcpy:
+_memmove:
+       mr      r2,r3   // Store dest ptr in r2 to preserve r3 on return
+
+// ------------------
+// Standard registers
+
+#define rs     r4
+#define rd     r2
+#define rc     r5
+
+// Should we bother using Altivec?
+
+       cmpwi   r5, 128
+       blt+    LScalar
+
+// Determine whether we have Altivec enabled
+
+       mflr    r0
+       bcl     20,31,1f
+1:
+       mflr    r6
+       mtlr    r0
+       addis   r6, r6, ha16(__cpu_has_altivec - 1b)
+       lwz     r6, lo16(__cpu_has_altivec - 1b)(r6)
+       cmpwi   r6, 0
+       bne+    LAltivec
+       
+// =================================================================================================
+
+//  *****************************************
+//  * S c a l a r B l o c k M o o f D a t a *
+//  *****************************************
+// 
+//  This is the scalar (non-AltiVec) version of BlockMoofData.
+// 
+//             void ScalarBlockMoofData                        (ptr sou, ptr dest, long len)
+//             void ScalarBlockMoofDataUncached        (ptr sou, ptr dest, long len)
+// 
+// 
+//  Calling Sequence:  r3 = source pointer
+//                                             r4 = destination pointer
+//                                             r5 = length in bytes
+// 
+//  Uses: all volatile registers.
+
+LScalar:
+               cmplwi  cr7,rc,32                               //  length <= 32 bytes?
+               cmplw   cr6,rd,rs                               //  up or down?
+               mr.             r0,rc                                   //  copy to r0 for MoveShort, and test for negative
+               bgt             cr7,Lbm1                                //  skip if count > 32
+               
+//  Handle short moves (<=32 bytes.)
+
+               beq             cr7,LMove32                             //  special case 32-byte blocks
+               blt             cr6,LMoveDownShort              //  move down in memory and return
+               add             rs,rs,rc                                //  moving up (right-to-left), so adjust pointers
+               add             rd,rd,rc
+               b               LMoveUpShort                    //  move up in memory and return
+
+//  Handle long moves (>32 bytes.)
+
+Lbm1:
+               beqlr   cr6                                             //  rs==rd, so nothing to move
+               bltlr   cr0                                             //  length<0, so ignore call and return
+               mflr    r12                                             //  save return address
+               bge             cr6,Lbm2                                //  rd>=rs, so move up
+
+//  Long moves down (left-to-right.)
+
+               neg             r6,rd                                   //  start to 32-byte-align destination
+               andi.   r0,r6,0x1F                              //  r0 <- bytes to move to align destination
+               bnel    LMoveDownShort                  //  align destination if necessary
+               bl              LMoveDownLong                   //  move 32-byte chunks down
+               andi.   r0,rc,0x1F                              //  done?
+               mtlr    r12                                             //  restore caller's return address
+               bne             LMoveDownShort                  //  move trailing leftover bytes and done
+               blr                                                             //  no leftovers, so done
+               
+//  Long moves up (right-to-left.)
+
+Lbm2:
+               add             rs,rs,rc                                //  moving up (right-to-left), so adjust pointers
+               add             rd,rd,rc
+               andi.   r0,rd,0x1F                              //  r0 <- bytes to move to align destination
+               bnel    LMoveUpShort                    //  align destination if necessary
+               bl              LMoveUpLong                             //  move 32-byte chunks up
+               andi.   r0,rc,0x1F                              //  done?
+               mtlr    r12                                             //  restore caller's return address
+               bne             LMoveUpShort                    //  move trailing leftover bytes and done
+               blr                                                             //  no leftovers, so done
+
+//  ***************
+//  * M O V E 3 2 *
+//  ***************
+// 
+//  Special case subroutine to move a 32-byte block.  MoveDownShort and
+//  MoveUpShort only handle 0..31 bytes, and we believe 32 bytes is too
+//  common a case to send it through the general purpose long-block code.
+//  Since it moves both up and down, we must load all 32 bytes before
+//  storing any.
+// 
+//  Calling Sequence:  rs = source ptr
+//                                      rd = destination ptr
+// 
+//  Uses: r0,r5-r11.
+// 
+
+LMove32:
+               lwz             r0,0(rs)
+               lwz             r5,4(rs)
+               lwz             r6,8(rs)
+               lwz             r7,12(rs)
+               lwz             r8,16(rs)
+               lwz             r9,20(rs)
+               lwz             r10,24(rs)
+               lwz             r11,28(rs)
+               stw             r0,0(rd)
+               stw             r5,4(rd)
+               stw             r6,8(rd)
+               stw             r7,12(rd)
+               stw             r8,16(rd)
+               stw             r9,20(rd)
+               stw             r10,24(rd)
+               stw             r11,28(rd)
+               blr
+               
+
+//  *************************
+//  * M o v e U p S h o r t *
+//  *************************
+// 
+//  Subroutine called to move <32 bytes up in memory (ie, right-to-left).
+// 
+//  Entry conditions: rs = last byte moved from source (right-to-left)
+//                                     rd = last byte moved into destination
+//                                     r0 = #bytes to move (0..31)
+// 
+//  Exit conditions:  rs = updated source ptr
+//                                     rd = updated destination ptr
+//                                     rc = decremented by #bytes moved
+// 
+//  Uses: r0,r6,r7,r8,cr7.
+// 
+
+LMoveUpShort:
+               andi.   r6,r0,0x10                              //  test 0x10 bit in length
+               mtcrf   0x1,r0                                  //  move count to cr7 so we can test bits
+               sub             rc,rc,r0                                //  decrement count of bytes remaining to be moved
+               beq             Lmus1                                   //  skip if 0x10 bit in length is 0
+               lwzu    r0,-16(rs)                              //  set, so copy up 16 bytes
+               lwz             r6,4(rs)
+               lwz             r7,8(rs)
+               lwz             r8,12(rs)
+               stwu    r0,-16(rd)
+               stw             r6,4(rd)
+               stw             r7,8(rd)
+               stw             r8,12(rd)
+
+Lmus1:
+               bf              28,Lmus2                                //  test 0x08 bit
+               lwzu    r0,-8(rs)
+               lwz             r6,4(rs)
+               stwu    r0,-8(rd)
+               stw             r6,4(rd)
+
+Lmus2:
+               bf              29,Lmus3                                //  test 0x4 bit
+               lwzu    r0,-4(rs)
+               stwu    r0,-4(rd)
+
+Lmus3:
+               bf              30,Lmus4                                //  test 0x2 bit
+               lhzu    r0,-2(rs)
+               sthu    r0,-2(rd)
+
+Lmus4:
+               bflr    31                                              //  test 0x1 bit, return if 0
+               lbzu    r0,-1(rs)
+               stbu    r0,-1(rd)
+               blr
+
+
+//  *****************************
+//  * M o v e D o w n S h o r t *
+//  *****************************
+// 
+//  Subroutine called to move <32 bytes down in memory (ie, left-to-right).
+// 
+//  Entry conditions: rs = source pointer
+//                                     rd = destination pointer
+//                                     r0 = #bytes to move (0..31)
+// 
+//  Exit conditions:  rs = ptr to 1st byte not moved
+//                                     rd = ptr to 1st byte not moved
+//                                     rc = decremented by #bytes moved
+// 
+//  Uses: r0,r6,r7,r8,cr7.
+// 
+
+LMoveDownShort:
+               andi.   r6,r0,0x10                              //  test 0x10 bit in length
+               mtcrf   0x1,r0                                  //  move count to cr7 so we can test bits
+               sub             rc,rc,r0                                //  decrement count of bytes remaining to be moved
+               beq             Lmds1                                   //  skip if 0x10 bit in length is 0
+               lwz             r0,0(rs)                                //  set, so copy down 16 bytes
+               lwz             r6,4(rs)
+               lwz             r7,8(rs)
+               lwz             r8,12(rs)
+               addi    rs,rs,16
+               stw             r0,0(rd)
+               stw             r6,4(rd)
+               stw             r7,8(rd)
+               stw             r8,12(rd)
+               addi    rd,rd,16
+
+Lmds1:
+               bf              28,Lmds2                                //  test 0x08 bit
+               lwz             r0,0(rs)
+               lwz             r6,4(rs)
+               addi    rs,rs,8
+               stw             r0,0(rd)
+               stw             r6,4(rd)
+               addi    rd,rd,8
+
+Lmds2:
+               bf              29,Lmds3                                //  test 0x4 bit
+               lwz             r0,0(rs)
+               addi    rs,rs,4
+               stw             r0,0(rd)
+               addi    rd,rd,4
+
+Lmds3:
+               bf              30,Lmds4                                //  test 0x2 bit
+               lhz             r0,0(rs)
+               addi    rs,rs,2
+               sth             r0,0(rd)
+               addi    rd,rd,2
+
+Lmds4:
+               bflr    31                                              //  test 0x1 bit, return if 0
+               lbz             r0,0(rs)
+               addi    rs,rs,1
+               stb             r0,0(rd)
+               addi    rd,rd,1
+               blr
+
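The two short movers decompose their 0..31-byte count by binary digits: one chunk of 16, 8, 4, 2, and 1 bytes is moved for each bit set in r0, largest first. A minimal C sketch of the forward (LMoveDownShort) case, assuming non-overlapping buffers; the names are illustrative, not from this file:

    #include <string.h>

    /* Copy n (0..31) bytes forward by testing each bit of the count. */
    static void move_down_short(const char *src, char *dst, unsigned n)
    {
        if (n & 16) { memcpy(dst, src, 16); src += 16; dst += 16; }
        if (n & 8)  { memcpy(dst, src, 8);  src += 8;  dst += 8;  }
        if (n & 4)  { memcpy(dst, src, 4);  src += 4;  dst += 4;  }
        if (n & 2)  { memcpy(dst, src, 2);  src += 2;  dst += 2;  }
        if (n & 1)  { *dst = *src; }
    }

The assembly gets the same effect by moving the count into cr7 with mtcrf and testing one condition-register bit per chunk size.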
+
+//  ***********************
+//  * M o v e U p L o n g *
+//  ***********************
+// 
+//  Subroutine to move 32-byte chunks of memory up (ie, right-to-left).
+//  The destination is known to be 32-byte aligned, but the source is
+//  *not* necessarily aligned.
+// 
+//  Entry conditions: rs = last byte moved from source (right-to-left)
+//                                     rd = last byte moved into destination
+//                                     rc = count of bytes to move
+//                                     cr = crCached set iff destination is cacheable
+// 
+//  Exit conditions:  rs = updated source ptr
+//                                     rd = updated destination ptr
+//                                     rc = low order 8 bits of count of bytes to move
+// 
+//  Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.
+// 
+
+LMoveUpLong:
+               srwi.   r11,rc,5                                // r11 <- #32 byte chunks to move
+               mtctr   r11                                             //  prepare loop count
+               beqlr                                                   //  return if no chunks to move
+               andi.   r0,rs,7                                 //  is source at least doubleword aligned?
+               beq             Lmup3                                   //  yes, can optimize this case
+               mtcrf   0x1,rc                                  //  save low bits of count
+               mtcrf   0x2,rc                                  //  (one cr at a time, as 604 prefers)
+
+Lmup1:                                                                 //  loop over each 32-byte-chunk
+               lwzu    r0,-32(rs)
+               subi    rd,rd,32                                //  prepare destination address for 'dcbz'
+               lwz             r5,4(rs)
+               lwz             r6,8(rs)
+               lwz             r7,12(rs)
+               lwz             r8,16(rs)
+               lwz             r9,20(rs)
+               lwz             r10,24(rs)
+               lwz             r11,28(rs)
+               stw             r0,0(rd)
+               stw             r5,4(rd)
+               stw             r6,8(rd)
+               stw             r7,12(rd)
+               stw             r8,16(rd)
+               stw             r9,20(rd)
+               stw             r10,24(rd)
+               stw             r11,28(rd)
+               bdnz    Lmup1
+               mfcr    rc                                              //  restore low bits of count
+               blr                                                             //  return to caller
+
+//  Aligned operands, so use d.p. floating point registers to move data.
+
+Lmup3:
+               lfdu    f0,-32(rs)
+               subi    rd,rd,32                                //  prepare destination address for 'dcbz'
+               lfd             f1,8(rs)
+               lfd             f2,16(rs)
+               lfd             f3,24(rs)
+               stfd    f0,0(rd)
+               stfd    f1,8(rd)
+               stfd    f2,16(rd)
+               stfd    f3,24(rd)
+               bdnz    Lmup3
+               blr                                                             //  return to caller
+               
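Lmup3 above (and Lmdown3 below) use the floating-point registers purely as 64-bit data movers: on 32-bit PowerPC the FPRs are the only 64-bit registers available, so a doubleword-aligned source needs half as many loads and stores per chunk. A rough C analogue, assuming both pointers are 8-byte aligned and the chunks do not overlap:

    #include <stdint.h>

    /* Move `chunks` 32-byte chunks as four 64-bit doublewords each. */
    static void move_chunks_aligned(const uint64_t *src, uint64_t *dst,
                                    unsigned chunks)
    {
        while (chunks--) {
            dst[0] = src[0]; dst[1] = src[1];
            dst[2] = src[2]; dst[3] = src[3];
            src += 4; dst += 4;
        }
    }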
+
+//  ***************************
+//  * M o v e D o w n L o n g *
+//  ***************************
+// 
+//  Subroutine to move 32-byte chunks of memory down (ie, left-to-right).
+//  The destination is known to be 32-byte aligned, but the source is
+//  *not* necessarily aligned.
+// 
+//  Entry conditions: rs = source ptr (next byte to move)
+//                                     rd = dest ptr (next byte to move into)
+//                                     rc = count of bytes to move
+//                                     cr = crCached set iff destination is cacheable
+// 
+//  Exit conditions:  rs = updated source ptr
+//                                     rd = updated destination ptr
+//                                     rc = low order 8 bits of count of bytes to move
+// 
+//  Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.
+// 
+
+LMoveDownLong:
+               srwi.   r11,rc,5                                // r11 <- #32 byte chunks to move
+               mtctr   r11                                             //  prepare loop count
+               beqlr                                                   //  return if no chunks to move
+               andi.   r0,rs,7                                 //  is source at least doubleword aligned?
+               beq             Lmdown3                                 //  yes, can optimize this case
+               mtcrf   0x1,rc                                  //  save low 8 bits of count
+               mtcrf   0x2,rc                                  //  (one cr at a time, as 604 prefers)
+
+Lmdown1:                                                                       //  loop over each 32-byte-chunk
+               lwz             r0,0(rs)
+               lwz             r5,4(rs)
+               lwz             r6,8(rs)
+               lwz             r7,12(rs)
+               lwz             r8,16(rs)
+               lwz             r9,20(rs)
+               lwz             r10,24(rs)
+               lwz             r11,28(rs)
+               stw             r0,0(rd)
+               stw             r5,4(rd)
+               stw             r6,8(rd)
+               stw             r7,12(rd)
+               stw             r8,16(rd)
+               stw             r9,20(rd)
+               addi    rs,rs,32
+               stw             r10,24(rd)
+               stw             r11,28(rd)
+               addi    rd,rd,32
+               bdnz    Lmdown1
+               mfcr    rc                                              //  restore low bits of count
+               blr                                                             //  return to caller
+
+//  Aligned operands, so use d.p. floating point registers to move data.
+
+Lmdown3:
+               lfd             f0,0(rs)
+               lfd             f1,8(rs)
+               lfd             f2,16(rs)
+               lfd             f3,24(rs)
+               addi    rs,rs,32
+               stfd    f0,0(rd)
+               stfd    f1,8(rd)
+               stfd    f2,16(rd)
+               stfd    f3,24(rd)
+               addi    rd,rd,32
+               bdnz    Lmdown3
+               blr                                                             //  return to caller
+
+//
+// Register use conventions are as follows:
+//
+// r0 - temp
+// r6 - copy of VMX SPR at entry
+// r7 - temp
+// r8 - constant -1 (also temp and a string op buffer)
+// r9 - constant 16 or -17 (also temp and a string op buffer)
+// r10- constant 32 or -33 (also temp and a string op buffer)
+// r11- constant 48 or -49 (also temp and a string op buffer)
+// r12- chunk count ("c") in long moves
+//
+// v0 - vp - permute vector
+// v1 - va - 1st quadword of source
+// v2 - vb - 2nd quadword of source
+// v3 - vc - 3rd quadword of source
+// v4 - vd - 4th quadword of source
+// v5 - vx - temp
+// v6 - vy - temp
+// v7 - vz - temp
+
+#define vp     v0
+#define va     v1
+#define vb     v2
+#define vc     v3
+#define vd     v4
+#define vx     v5
+#define vy     v6
+#define vz     v7
+
+#define VRSave 256
+
+// kShort should be the crossover point where the long algorithm is faster than the short.
+// WARNING: kShort must be >= 64
+
+// Yes, I know, we just checked rc > 128 to get here...
+
+#define kShort 128
+LAltivec:
+               cmpwi   cr1,rc,kShort           //(1) too short to bother using vector regs?
+               sub.    r0,rd,rs                        //(1) must move reverse if (rd-rs)<rc
+               dcbt    0,rs                            //(2) prefetch first source block
+               cmplw   cr6,r0,rc                       //(2) set cr6 blt iff we must move reverse
+               beqlr-                                          //(2) done if src==dest
+               srawi.  r9,rc,4                         //(3) r9 <- quadwords to move, test for zero
+               or              r8,rs,rd                        //(3) start to check for word alignment
+               dcbtst  0,rd                            //(4) prefetch first destination block
+               rlwinm  r8,r8,0,30,31           //(4) r8 is zero if word aligned
+               bgt-    cr1,LMoveLong           //(4) handle long operands
+               cmpwi   cr1,r8,0                        //(5) word aligned?
+               rlwinm  r7,rc,0,28,31           //(5) r7 <- leftover bytes to move after quadwords
+               bltlr-                                          //(5) done if negative count
+               blt-    cr6,LShortReverse       //(5) handle reverse moves
+               cmpwi   cr7,r7,0                        //(6) leftover bytes?
+               beq-    Leftovers                       //(6) r9==0, so no quadwords to move
+               mtctr   r9                                      //(7) set up for quadword loop
+               bne-    cr1,LUnalignedLoop      //(7) not word aligned (less common than word aligned)
+
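The `sub.`/`cmplw cr6` pair at LAltivec is the classic memmove overlap test: a forward copy is unsafe exactly when the destination starts within rc bytes above the source, and doing the compare unsigned handles rd < rs for free, since the subtraction then wraps to a huge value. The same test in C, as a sketch:

    #include <stddef.h>
    #include <stdint.h>

    /* True iff a left-to-right copy of len bytes would clobber src. */
    static int must_copy_reverse(const void *src, const void *dst, size_t len)
    {
        return (uintptr_t)dst - (uintptr_t)src < (uintptr_t)len;
    }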
+               
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><>                         S H O R T   O P E R A N D S                        <><> 
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+               
+LAlignedLoop:                                          // word aligned operands (the common case)
+               lfd             f0,0(rs)                        //(1)
+               lfd             f1,8(rs)                        //(2)
+               addi    rs,rs,16                        //(2)
+               stfd    f0,0(rd)                        //(3)
+               stfd    f1,8(rd)                        //(4)
+               addi    rd,rd,16                        //(4)
+               bdnz    LAlignedLoop            //(4)
+               
+Leftovers:
+               beqlr-  cr7                                     //(8) done if r7==0, ie no leftover bytes
+               mtxer   r7                                      //(9) count of bytes to move (1-15)
+               lswx    r8,0,rs
+               stswx   r8,0,rd
+               blr                                                     //(17)
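Leftovers finishes the final 1-15 bytes with one load-string/store-string pair, after parking the byte count in XER for `lswx`/`stswx`. Functionally it is nothing more than this byte loop (a sketch of the semantics, not of what the string instructions do internally):

    /* Copy the n = 1..15 leftover bytes one at a time. */
    static void move_tail(const char *src, char *dst, unsigned n)
    {
        while (n--) *dst++ = *src++;
    }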
+
+LUnalignedLoop:                                                // not word aligned, cannot use lfd/stfd
+               lwz             r8,0(rs)                        //(1)
+               lwz             r9,4(rs)                        //(2)
+               lwz             r10,8(rs)                       //(3)
+               lwz             r11,12(rs)                      //(4)
+               addi    rs,rs,16                        //(4)
+               stw             r8,0(rd)                        //(5)
+               stw             r9,4(rd)                        //(6)
+               stw             r10,8(rd)                       //(7)
+               stw             r11,12(rd)                      //(8)
+               addi    rd,rd,16                        //(8)
+               bdnz    LUnalignedLoop          //(8)
+               
+               b               Leftovers
+               
+               
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><>                   S H O R T   R E V E R S E   M O V E S                    <><> 
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+               
+               // cr0 & r9 <- #quadwords to move (>=0)
+               // cr1      <- beq if word aligned
+               //       r7 <- #leftover bytes to move (0-15)
+               
+LShortReverse:
+               cmpwi   cr7,r7,0                        // leftover bytes?
+               add             rs,rs,rc                        // point 1 past end of string for reverse moves
+               add             rd,rd,rc
+               beq-    LeftoversReverse        // r9==0, ie no quadwords to move
+               mtctr   r9                                      // set up for quadword loop
+               bne-    cr1,LUnalignedLoopReverse
+               
+LAlignedLoopReverse:                                   // word aligned, so use lfd/stfd
+               lfd             f0,-8(rs)
+               lfdu    f1,-16(rs)
+               stfd    f0,-8(rd)
+               stfdu   f1,-16(rd)
+               bdnz    LAlignedLoopReverse
+               
+LeftoversReverse:
+               beqlr-  cr7                                     // done if r7==0, ie no leftover bytes
+               mtxer   r7                                      // count of bytes to move (1-15)
+               neg             r7,r7                           // index back by #bytes
+               lswx    r8,r7,rs
+               stswx   r8,r7,rd
+               blr
+               
+LUnalignedLoopReverse:                         // not word aligned, cannot use lfd/stfd
+               lwz             r8,-4(rs)
+               lwz             r9,-8(rs)
+               lwz             r10,-12(rs)
+               lwzu    r11,-16(rs)
+               stw             r8,-4(rd)
+               stw             r9,-8(rd)
+               stw             r10,-12(rd)
+               stwu    r11,-16(rd)
+               bdnz    LUnalignedLoopReverse
+               
+               b               LeftoversReverse
+               
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><>                          L O N G   O P E R A N D S                         <><> 
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+               // cr6 set (blt) if must move reverse
+               // r0 <- (rd - rs)
+                       
+LMoveLong:                             
+               mfspr   r6,VRSave                       //(5) save caller's VMX mask register
+               stw             r6,-4(r1)                       // use CR save area so we can use r6 later
+               neg             r8,rd                           //(5) start to compute #bytes to fill in 1st dest quadword
+               rlwinm  r0,r0,0,28,31           //(6) start to determine relative alignment
+               andi.   r7,r8,0xF                       //(6) r7 <- #bytes to fill in 1st dest quadword
+               cmpwi   cr7,r0,0                        //(7) relatively aligned? (ie, 16 bytes apart?)
+               oris    r9,r6,0xFF00            //(7) light bits for regs we use (v0-v7)
+               mtspr   VRSave,r9                       //(8) update live register bitmask
+               blt-    cr6,LongReverse         //(8) must move reverse direction
+               sub             rc,rc,r7                        //(9) adjust length while we wait
+               beq-    LDest16Aligned          //(9) r7==0, ie destination already quadword aligned
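VRSave (SPR 256) is a bitmask in which each bit, starting from the most significant, marks the corresponding vector register as live, so the kernel knows which registers to preserve across context switches; the `oris` above lights the top eight bits for v0-v7. A hedged sketch of the bookkeeping, where mfspr_vrsave()/mtspr_vrsave() are hypothetical wrappers for the mfspr/mtspr instructions:

    unsigned old_mask = mfspr_vrsave();     /* caller's live-register mask */
    mtspr_vrsave(old_mask | 0xFF000000u);   /* also mark v0..v7 in use */
    /* ... vector copy ... */
    mtspr_vrsave(old_mask);                 /* restore on the way out */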
+               
+               // Align destination on a quadword.
+               
+               mtxer   r7                                      //(10) set up byte count (1-15)
+               lswx    r8,0,rs                         // load into r8-r11
+               stswx   r8,0,rd                         // store r8-r11 (measured latency on arthur is 7.2 cycles)
+               add             rd,rd,r7                        //(18) adjust ptrs
+               add             rs,rs,r7                        //(18)
+               
+               // Begin preparation for inner loop and "dst" stream.
+               
+LDest16Aligned:
+               andi.   r0,rd,0x10                              //(19) is destination cache-block aligned?
+               li              r9,16                           //(19) r9 <- constant used to access 2nd quadword
+               li              r10,32                          //(20) r10<- constant used to access 3rd quadword
+               beq-    cr7,LAligned            //(20) handle relatively aligned operands
+               lvx             va,0,rs                         //(20) prefetch 1st source quadword
+               li              r11,48                          //(21) r11<- constant used to access 4th quadword
+               lvsl    vp,0,rs                         //(21) get permute vector to left shift
+               beq             LDest32Aligned          //(22) destination already cache-block aligned
+               
+               // Copy 16 bytes to align destination on 32-byte (cache block) boundary
+               // to maximize store gathering.
+               
+               lvx             vb,r9,rs                        //(23) get 2nd source qw
+               subi    rc,rc,16                        //(23) adjust count
+               addi    rs,rs,16                        //(24) adjust source ptr
+               vperm   vx,va,vb,vp                     //(25) vx <- 1st destination qw
+               vor             va,vb,vb                        //(25) va <- vb
+               stvx    vx,0,rd                         //(26) assuming store Q deep enough to avoid latency
+               addi    rd,rd,16                        //(26) adjust dest ptr
+               
+               // Destination 32-byte aligned, source alignment unknown.
+
+LDest32Aligned:
+               srwi.   r12,rc,6                        //(27) r12<- count of 64-byte chunks to move
+               rlwinm  r7,rc,28,30,31          //(27) r7 <- count of 16-byte chunks to move
+               cmpwi   cr1,r7,0                        //(28) remember if any 16-byte chunks
+               rlwinm  r8,r12,0,26,31          //(29) mask chunk count down to 0-63
+               subi    r0,r8,1                         //(30) r8==0?
+               beq-    LNoChunks                       //(30) r12==0, ie no chunks to move
+               rlwimi  r8,r0,0,25,25           //(31) if r8==0, then r8 <- 64
+               li              r0,64                           //(31) r0 <- used to get 1st quadword of next chunk
+               sub.    r12,r12,r8                      //(32) adjust chunk count, set cr0
+               mtctr   r8                                      //(32) set up loop count
+               li              r8,96                           //SKP
+               li              r6,128                          //SKP
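The prologue above batches the 64-byte chunks into groups of at most 64 (4KB of data), so the ctr-driven inner loop can pause periodically to reprime the prefetch stream; the `subi`/`rlwimi` pair turns a masked batch count of 0 into a full 64. The same arithmetic in C, as a sketch:

    unsigned batch = chunks & 63;   /* low 6 bits of the chunk count */
    if (batch == 0)
        batch = 64;                 /* the rlwimi trick: 0 means a full batch */
    chunks -= batch;                /* cr0 records whether more remain */
    /* run the inner loop `batch` times, then reprime and loop again */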
+               // Inner loop for unaligned sources.  We copy 64 bytes per iteration.
+               // We loop at most 64 times, then reprime the "dst" and loop again for
+               // the next 4KB.  This loop is tuned to keep the CPU flat out, which
+               // means we need to execute a lvx or stvx every cycle.
+               
+LoopBy64:
+               dcbt    rs,r8                           //SKP
+               dcbt    rs,r6                           //SKP
+               lvx             vb,r9,rs                        //(1) 2nd source quadword (1st already in va)
+               lvx             vc,r10,rs                       //(2) 3rd
+               lvx             vd,r11,rs                       //(3) 4th
+               vperm   vx,va,vb,vp                     //(3) vx <- 1st destination quadword
+               lvx             va,rs,r0                        //(4) get 1st qw of next 64-byte chunk (r0 must be RB!)
+               vperm   vy,vb,vc,vp                     //(4) vy <- 2nd dest qw
+               stvx    vx,0,rd                         //(5)
+               vperm   vz,vc,vd,vp                     //(5) vz <- 3rd dest qw
+               stvx    vy,r9,rd                        //(6)
+               vperm   vx,vd,va,vp                     //(6) vx <- 4th
+               stvx    vz,r10,rd                       //(7)
+               addi    rs,rs,64                        //(7)
+               stvx    vx,r11,rd                       //(8)
+               addi    rd,rd,64                        //(8)
+               bdnz    LoopBy64                        //(8)
+               
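Each `vperm` in the loop stitches one aligned destination quadword out of two adjacent source quadwords, using the permute vector that `lvsl` built from the source's byte misalignment. A byte-level C model of that single step (illustrative only):

    /* Build one 16-byte output from aligned source quadwords a and b,
       where the source pointer was misaligned by off (0..15) bytes. */
    static void realign_qw(const unsigned char *a, const unsigned char *b,
                           unsigned char *out, unsigned off)
    {
        unsigned i;
        for (i = 0; i < 16; i++)
            out[i] = (i + off < 16) ? a[i + off] : b[i + off - 16];
    }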
+               // End of inner loop.  Should we reprime dst stream and restart loop?
+               // This block is only executed when we're moving more than 4KB.
+               // It is usually folded out because cr0 is set in the loop prologue.
+               
+               beq+    LNoChunks                       // r12==0, ie no more chunks to move
+               sub.    r12,r12,r0                      // set cr0 if more than 4KB remain to xfer
+               mtctr   r0                                      // initialize loop count to 64
+               b               LoopBy64                        // restart inner loop, xfer another 4KB
+               
+               // Fewer than 64 bytes remain to be moved.
+               
+LNoChunks:                                                     // r7 and cr1 are set with the number of QWs
+               andi.   rc,rc,0xF                       //(33) rc <- leftover bytes
+               beq-    cr1,LCleanup            //(33) r7==0, ie fewer than 16 bytes remaining
+               mtctr   r7                                      //(34) we will loop over 1-3 QWs
+
+LoopBy16:
+               lvx             vb,r9,rs                        //(1) vb <- 2nd source quadword
+               addi    rs,rs,16                        //(1)
+               vperm   vx,va,vb,vp                     //(3) vx <- next destination quadword
+               vor             va,vb,vb                        //(3) va <- vb
+               stvx    vx,0,rd                         //(4) assuming store Q is deep enough to mask latency
+               addi    rd,rd,16                        //(4)
+               bdnz    LoopBy16                        //(4)
+               
+               // Move remaining bytes in last quadword.  rc and cr0 have the count.
+               
+LCleanup:
+               lwz             r6,-4(r1)                   // load VRSave from CR save area
+               mtspr   VRSave,r6                       //(35) restore caller's live-register bitmask
+               beqlr                                           //(36) rc==0, ie no leftovers, so done
+               mtxer   rc                                      //(37) load byte count (1-15)
+               lswx    r8,0,rs
+               stswx   r8,0,rd
+               blr                                                     //(45)
+               
+               
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><>              L O N G   A L I G N E D   M O V E S                           <><> 
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+               // rs, rd <- both quadword aligned
+               // cr0 <- beq if dest is cache block (32-byte) aligned
+               // r9  <- 16
+               // r10 <- 32
+               
+LAligned:
+               lvx             va,0,rs                         // prefetch 1st source quadword
+               li              r11,48                          // r11<- constant used to access 4th quadword
+               beq             LAligned32                      // destination already cache-block aligned
+               
+               // Copy 16 bytes to align destination on 32-byte (cache block) boundary
+               // to maximize store gathering.
+               
+               subi    rc,rc,16                        // adjust count
+               addi    rs,rs,16                        // adjust source ptr
+               stvx    va,0,rd                         // assuming store Q deep enough to avoid latency
+               addi    rd,rd,16                        // adjust dest ptr
+               
+               // Destination 32-byte aligned, source 16-byte aligned.  Set up for inner loop.
+
+LAligned32:
+               srwi.   r12,rc,6                        // r12<- count of 64-byte chunks to move
+               rlwinm  r7,rc,28,30,31          // r7 <- count of 16-byte chunks to move
+               cmpwi   cr1,r7,0                        // remember if any 16-byte chunks
+               rlwinm  r8,r12,0,26,31          // mask chunk count down to 0-63
+               subi    r0,r8,1                         // r8==0?
+               beq-    LAlignedNoChunks        // r12==0, ie no chunks to move
+               rlwimi  r8,r0,0,25,25           // if r8==0, then r8 <- 64
+               li              r0,64                           // r0 <- used at end of loop
+               sub.    r12,r12,r8                      // adjust chunk count, set cr0
+               mtctr   r8                                      // set up loop count
+               li              r8,96                           //SKP
+               li              r6,128                          //SKP
+               
+               // Inner loop for aligned sources.  We copy 64 bytes per iteration.
+               
+LAlignedLoopBy64:
+               dcbt    rs,r8                           //SKP
+               dcbt    rs,r6                           //SKP
+               lvx             va,0,rs                         //(1)
+               lvx             vb,r9,rs                        //(2)
+               lvx             vc,r10,rs                       //(3)
+               lvx             vd,r11,rs                       //(4)
+               addi    rs,rs,64                        //(4)
+               stvx    va,0,rd                         //(5)
+               stvx    vb,r9,rd                        //(6)
+               stvx    vc,r10,rd                       //(7)
+               stvx    vd,r11,rd                       //(8)
+               addi    rd,rd,64                        //(8)
+               bdnz    LAlignedLoopBy64        //(8)
+               
+               // End of inner loop.  Loop again for next 4KB iff any.
+               
+               beq+    LAlignedNoChunks        // r12==0, ie no more chunks to move
+               sub.    r12,r12,r0                      // set cr0 if more than 4KB remain to xfer
+               mtctr   r0                                      // reinitialize loop count to 64
+               b               LAlignedLoopBy64        // restart inner loop, xfer another 4KB
+               
+               // Fewer than 64 bytes remain to be moved.
+               
+LAlignedNoChunks:                                      // r7 and cr1 are set with the number of QWs
+               andi.   rc,rc,0xF                       // rc <- leftover bytes
+               beq-    cr1,LCleanup            // r7==0, ie fewer than 16 bytes remaining
+               mtctr   r7                                      // we will loop over 1-3 QWs
+
+LAlignedLoopBy16:
+               lvx             va,0,rs                         // get next quadword
+               addi    rs,rs,16
+               stvx    va,0,rd
+               addi    rd,rd,16
+               bdnz    LAlignedLoopBy16
+               
+               b               LCleanup                        // handle last 0-15 bytes, if any
+
+               
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><>              L O N G   R E V E R S E   M O V E S                           <><> 
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+               // Reverse moves.  These involve overlapping operands, with the source
+               // lower in memory (lower addresses) than the destination.  They must be
+               // done right-to-left, ie from high addresses down to low addresses.
+               // Throughout this code, we maintain rs and rd as pointers one byte past
+               // the end of the untransferred operands.
+               //
+               // The byte count is >=kShort and the following registers are already loaded:
+               //
+               //      r6  - VMX mask at entry
+               //      cr7 - beq if relatively aligned
+               //
+               
+LongReverse:
+               add             rd,rd,rc                        // update source/dest ptrs to be 1 byte past end
+               add             rs,rs,rc
+               andi.   r7,rd,0xF                       // r7 <- #bytes needed to move to align destination
+               sub             rc,rc,r7                        // adjust length while we wait
+               sub             rs,rs,r7                        // adjust ptrs by #bytes to xfer, also while we wait
+               sub             rd,rd,r7
+               beq-    LDest16AlignedReverse
+               
+               // Align destination on a quadword.  Note that we do NOT align on a cache
+               // block boundary for store gathering etc, since all these operands overlap:
+               // many dest cache blocks will already be in the L1, so it's not clear that
+               // this would be a win.
+               
+               mtxer   r7                                      // load byte count
+               lswx    r8,0,rs
+               stswx   r8,0,rd
+               
+               // Prepare for inner loop and start "dstst" stream.  Frankly, it's not
+               // clear whether "dst" or "dstst" would be better; somebody should
+               // measure.  We use "dstst" because, being overlapped, at least some
+               // source cache blocks will also be stored into.
+               
+LDest16AlignedReverse:
+               srwi.   r12,rc,6                        // r12 <- count of 64-byte chunks to move
+               rlwinm  r0,rc,11,9,15           // position quadword count for dst
+               rlwinm  r11,r12,0,26,31         // mask chunk count down to 0-63
+               li              r9,-17                          // r9 <- constant used to access 2nd quadword
+               oris    r0,r0,0x0100            // set dst block size to 1 qw
+               li              r10,-33                         // r10<- constant used to access 3rd quadword
+               ori             r0,r0,0xFFE0            // set dst stride to -32 bytes
+               li              r8,-1                           // r8<- constant used to access 1st quadword
+               dstst   rs,r0,3                         // start stream 0
+               subi    r0,r11,1                        // r11==0 ?
+               lvx             va,r8,rs                        // prefetch 1st source quadword
+               rlwinm  r7,rc,28,30,31          // r7 <- count of 16-byte chunks to move
+               lvsl    vp,0,rs                         // get permute vector to right shift
+               cmpwi   cr1,r7,0                        // remember if any 16-byte chunks
+               beq-    LNoChunksReverse        // r12==0, so skip inner loop
+               rlwimi  r11,r0,0,25,25          // if r11==0, then r11 <- 64
+               sub.    r12,r12,r11                     // adjust chunk count, set cr0
+               mtctr   r11                                     // set up loop count
+               li              r11,-49                         // r11<- constant used to access 4th quadword
+               li              r0,-64                          // r0 <- used for several purposes
+               beq-    cr7,LAlignedLoopBy64Reverse
+               
+               // Inner loop for unaligned sources.  We copy 64 bytes per iteration.
+
+LoopBy64Reverse:
+               lvx             vb,r9,rs                        //(1) 2nd source quadword (1st already in va)
+               lvx             vc,r10,rs                       //(2) 3rd quadword
+               lvx             vd,r11,rs                       //(3) 4th
+               vperm   vx,vb,va,vp                     //(3) vx <- 1st destination quadword
+               lvx             va,rs,r0                        //(4) get 1st qw of next 64-byte chunk (note r0 must be RB)
+               vperm   vy,vc,vb,vp                     //(4) vy <- 2nd dest qw
+               stvx    vx,r8,rd                        //(5)
+               vperm   vz,vd,vc,vp                     //(5) vz <- 3rd destination quadword
+               stvx    vy,r9,rd                        //(6)
+               vperm   vx,va,vd,vp                     //(6) vx <- 4th qw
+               stvx    vz,r10,rd                       //(7)
+               subi    rs,rs,64                        //(7)
+               stvx    vx,r11,rd                       //(8)
+               subi    rd,rd,64                        //(8)
+               bdnz    LoopBy64Reverse         //(8)
+               
+               // End of inner loop.  Should we reprime dst stream and restart loop?
+               // This block is only executed when we're moving more than 4KB.
+               // It is usually folded out because cr0 is set in the loop prologue.
+               
+               beq+    LNoChunksReverse        // r12==0, ie no more chunks to move
+               lis             r8,0x0440                       // dst control: 64 4-qw blocks
+               add.    r12,r12,r0                      // set cr0 if more than 4KB remain to xfer
+               ori             r8,r8,0xFFC0            // stride is -64 bytes
+               dstst   rs,r8,3                         // restart the prefetch stream
+               li              r8,64                           // inner loop count
+               mtctr   r8                                      // initialize loop count to 64
+               li              r8,-1                           // restore qw1 offset for inner loop
+               b               LoopBy64Reverse         // restart inner loop, xfer another 4KB
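The `lis`/`ori` pair builds the dstst control operand directly. Per the AltiVec architecture, the high byte of the control word holds the block size in quadwords, the next byte the block count, and the low halfword the signed byte stride; the code's own comments confirm that layout. A sketch of the encoding, with the 0x0440FFC0 built above as the worked example:

    /* dst/dstst control word: block size (qw), block count, byte stride. */
    #define DST_CONTROL(size, count, stride) \
        (((unsigned)(size) << 24) | ((unsigned)(count) << 16) | \
         ((unsigned)(stride) & 0xFFFFu))

    /* 0x0440FFC0 == DST_CONTROL(4, 64, -64): prefetch 64 blocks of four
       quadwords each, stepping back 64 bytes per block. */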
+               
+               // Fewer than 64 bytes remain to be moved.
+               
+LNoChunksReverse:                                      // r7 and cr1 are set with the number of QWs
+               andi.   rc,rc,0xF                       // rc <- leftover bytes
+               beq-    cr1,LCleanupReverse     // r7==0, ie fewer than 16 bytes left
+               mtctr   r7
+               beq-    cr7,LAlignedLoopBy16Reverse
+
+LoopBy16Reverse:
+               lvx             vb,r9,rs                        // vb <- 2nd source quadword
+               subi    rs,rs,16
+               vperm   vx,vb,va,vp                     // vx <- next destination quadword
+               vor             va,vb,vb                        // va <- vb
+               stvx    vx,r8,rd
+               subi    rd,rd,16
+               bdnz    LoopBy16Reverse
+               
+               // Fewer than 16 bytes remain to be moved.
+               
+LCleanupReverse:                                       // rc and cr0 set with remaining byte count
+               lwz             r6,-4(r1)                       // load VRSave from CR save area
+               mtspr   VRSave,r6                       // restore caller's live-register bitmask
+               beqlr                                           // rc==0, ie no leftovers so done
+               neg             r7,rc                           // get -(#bytes)
+               mtxer   rc                                      // byte count
+               lswx    r8,r7,rs
+               stswx   r8,r7,rd
+               blr
+
+               
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><>        A L I G N E D   L O N G   R E V E R S E   M O V E S                 <><> 
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+               // Inner loop.  We copy 64 bytes per iteration.
+
+LAlignedLoopBy64Reverse:
+               lvx             va,r8,rs                        //(1)
+               lvx             vb,r9,rs                        //(2)
+               lvx             vc,r10,rs                       //(3)
+               lvx             vd,r11,rs                       //(4) 
+               subi    rs,rs,64                        //(4)
+               stvx    va,r8,rd                        //(5)
+               stvx    vb,r9,rd                        //(6)
+               stvx    vc,r10,rd                       //(7)
+               stvx    vd,r11,rd                       //(8)
+               subi    rd,rd,64                        //(8)
+               bdnz    LAlignedLoopBy64Reverse //(8)
+               
+               // End of inner loop.  Loop for next 4KB iff any.
+               
+               beq+    LNoChunksReverse        // r12==0, ie no more chunks to move
+               lis             r8,0x0440                       // dst control: 64 4-qw blocks
+               add.    r12,r12,r0                      // r12 <- r12 - 64, set cr0
+               ori             r8,r8,0xFFC0            // stride is -64 bytes
+               dstst   rs,r8,3                         // restart the prefetch stream
+               li              r8,64                           // inner loop count
+               mtctr   r8                                      // initialize loop count to 64
+               li              r8,-1                           // restore qw1 offset for inner loop
+               b               LAlignedLoopBy64Reverse
+
+               // Loop to copy leftover quadwords (1-3).
+               
+LAlignedLoopBy16Reverse:
+               lvx             va,r8,rs                        // get next qw
+               subi    rs,rs,16
+               stvx    va,r8,rd
+               subi    rd,rd,16
+               bdnz    LAlignedLoopBy16Reverse
+               
+               b               LCleanupReverse         // handle up to 15 bytes in last qw
diff --git a/gen.subproj/ppc.subproj/memcpy.s b/gen.subproj/ppc.subproj/memcpy.s
deleted file mode 100644 (file)
index 0c371f6..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- * 
- * @APPLE_LICENSE_HEADER_END@
- */
-#define MEMCPY
-#include "bcopy.s"
diff --git a/gen.subproj/ppc.subproj/memmove.s b/gen.subproj/ppc.subproj/memmove.s
deleted file mode 100644 (file)
index d517786..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- * 
- * @APPLE_LICENSE_HEADER_END@
- */
-#define MEMMOVE
-#include "bcopy.s"
index cbf6ab87ef0704ea76ad59e5657421deee3956cc..a19c7a4506a834fecf17ed94d5b9316b55efbdd7 100644 (file)
@@ -76,7 +76,8 @@ If 0 then the block is either free (in which case the size is directly at the bl
 
 #define PROTECT_SMALL                  0       // Should be 0: 1 is too slow for normal use
 
-#define LARGE_CACHE_SIZE       4       // define hysterisis of large chunks
+#define LARGE_CACHE_SIZE       1       // define hysteresis of large chunks
+#define MAX_LARGE_SIZE_TO_CACHE       (128*1024)  /* blocks larger than this are not cached */
 
 #define MAX_RECORDER_BUFFER    256
 
@@ -149,6 +150,7 @@ static size_t szone_good_size(szone_t *szone, size_t size);
 static boolean_t szone_check_all(szone_t *szone, const char *function);
 static void szone_print(szone_t *szone, boolean_t verbose);
 static INLINE region_t *region_for_ptr_no_lock(szone_t *szone, const void *ptr);
+static vm_range_t large_free_no_lock(szone_t *szone, large_entry_t *entry);
 
 #define LOG(szone,ptr) (szone->log_address && (szone->num_small_objects > 8) && (((unsigned)szone->log_address == -1) || (szone->log_address == (void *)(ptr))))
 
@@ -931,11 +933,9 @@ static void large_entries_grow_no_lock(szone_t *szone) {
 }
 
 static vm_range_t large_free_no_lock(szone_t *szone, large_entry_t *entry) {
-    // enters the specified large entry into the cache of freed entries
-    // returns a range to truly deallocate
-    vm_range_t         vm_range_to_deallocate;
+    // frees the specified entry in the size table
+    // returns a range to truly deallocate (the caller may still cache it)
     vm_range_t         range;
-    vm_range_t         *range_to_use;
     range.address = LARGE_ENTRY_ADDRESS(*entry);
     range.size = LARGE_ENTRY_SIZE(*entry);
     szone->num_large_objects_in_use --;
@@ -956,6 +956,18 @@ static vm_range_t large_free_no_lock(szone_t *szone, large_entry_t *entry) {
         sleep(3600);
     }
 #endif
+    return range;
+}
+
+static vm_range_t large_find_better_range_to_deallocate(szone_t *szone, vm_range_t range) {
+    // enters the specified large entry into the cache of freed entries
+    // returns a range to truly deallocate
+    vm_range_t         *range_to_use;
+    vm_range_t         vm_range_to_deallocate;
+    
+    // if the specified range is larger than MAX_LARGE_SIZE_TO_CACHE, the range is not cached 
+    if (range.size > MAX_LARGE_SIZE_TO_CACHE) return range;
+
     range = coalesce_range(szone->large_to_deallocate, LARGE_CACHE_SIZE, range);
     range_to_use = first_zero_range(szone->large_to_deallocate, LARGE_CACHE_SIZE);
     if (range_to_use) {
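This hunk splits the old behavior in two: large_free_no_lock() now only removes the entry, and the new large_find_better_range_to_deallocate() decides whether the freed range goes into the (now single-entry) cache or straight back to the VM system, refusing to cache anything over 128KB. A hedged sketch of the shape of that policy, with hypothetical names standing in for the parts of the function the diff truncates:

    typedef struct { unsigned long address, size; } range_sketch_t;

    /* Illustrative model only: the real code also coalesces the freed
       range against its neighbors in the cache before deciding. */
    static range_sketch_t cache_or_return(range_sketch_t freed,
                                          range_sketch_t *cache, int n,
                                          unsigned long max_cached)
    {
        range_sketch_t nothing = { 0, 0 };
        int i;
        if (freed.size > max_cached)
            return freed;                   /* too big to cache: free now */
        for (i = 0; i < n; i++)
            if (cache[i].size == 0) {       /* empty slot: park it */
                cache[i] = freed;
                return nothing;
            }
        range_sketch_t evicted = cache[0];  /* cache full: evict one */
        cache[0] = freed;
        return evicted;
    }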
@@ -1185,6 +1197,7 @@ static void szone_free(szone_t *szone, void *ptr) {
            vm_msync(mach_task_self(), LARGE_ENTRY_ADDRESS(*entry), LARGE_ENTRY_SIZE(*entry), VM_SYNC_KILLPAGES);
        }
         vm_range_to_deallocate = large_free_no_lock(szone, entry);
+       vm_range_to_deallocate = large_find_better_range_to_deallocate(szone, vm_range_to_deallocate);
 #if DEBUG_MALLOC
         if (large_entry_for_pointer_no_lock(szone, ptr)) {
             malloc_printf("*** malloc[%d]: Just after freeing 0x%x still in use num_large_entries=%d\n", getpid(), ptr, szone->num_large_entries);
@@ -1386,12 +1399,27 @@ static void *szone_realloc(szone_t *szone, void *ptr, size_t new_size) {
        if (szone_try_realloc_in_place(szone, ptr, old_size, new_size)) return ptr;
     }
     newPtr = szone_malloc(szone, new_size);
-    if (old_size > VM_COPY_THRESHOLD) {
+    if ((old_size > VM_COPY_THRESHOLD) && (old_size < (1 << (vm_page_shift + vm_page_shift)))) {
+       // we know it's a large block, and not a huge block
         kern_return_t  err = 0;
         err = vm_copy(mach_task_self(), (vm_address_t)ptr, old_size, (vm_address_t)newPtr);
         if (err) {
             szone_error(szone, "Can't vm_copy region", ptr);
-        }
+        } else {
+           large_entry_t       *entry;
+           vm_range_t          range;
+           SZONE_LOCK(szone);
+           entry = large_entry_for_pointer_no_lock(szone, ptr);
+           if (!entry) {
+               szone_error(szone, "Can't find entry for large copied block", ptr);
+           }
+           range = large_free_no_lock(szone, entry);
+           SZONE_UNLOCK(szone); // we release the lock asap
+           // we truly deallocate_pages, including guard pages
+           deallocate_pages(szone, range.address, range.size, 0);
+           if (LOG(szone, ptr)) malloc_printf("szone_realloc returned %p for %d\n", newPtr, (unsigned)new_size);
+           return newPtr;
+       }
     } else {
         memcpy(newPtr, ptr, old_size);
     }
index 2325a7ed3c33ee3d704be85c9c34a489477ad618..631c81518817d05a3c53a65db2fe11993d63269f 100644 (file)
@@ -92,7 +92,7 @@ setrunelocale(encoding)
                return(0);
        }
 
-       if (!PathLocale && !(PathLocale = getenv("PATH_LOCALE")))
+       if (!PathLocale)
                PathLocale = _PATH_LOCALE;
 
        sprintf(name, "%s/%s/LC_CTYPE", PathLocale, encoding);
index 8011e68eeedb45807e6e9c931fd07974e98184b9..7dc8b93fc377cfd17ab2b83106e613046398cdc2 100644 (file)
@@ -105,7 +105,7 @@ setlocale(category, locale)
        int found, i, len;
        char *env, *r;
 
-       if (!PathLocale && !(PathLocale = getenv("PATH_LOCALE")))
+       if (!PathLocale)
                PathLocale = _PATH_LOCALE;
 
        if (category < 0 || category >= _LC_LAST)
index 6fac3a0bcefa9179df6250bbaa23e8abd86b7cbf..2d3bd9c982d52333bf842c8783e5f98189dd445a 100644 (file)
@@ -123,7 +123,7 @@ int mach_init_doit(int forkchild)
                _atfork_child_routine = mach_atfork_child_routine;
                 _pthread_set_self(0);
                 cthread_set_self(0);
-       }
+       }
 
        /*
         *      Initialize the single mig reply port
@@ -209,11 +209,11 @@ int fork_mach_init()
 mach_port_t
 mach_task_self()
 {
-       return(mach_task_self_);
+       return(task_self_trap());
 }
 
 mach_port_t
 mach_thread_self()
 {
        return(thread_self_trap());
-}
\ No newline at end of file
+}
index ddf28a80971b62689d3992dfca85e4bec983a27b..3a927bd926e27b8aaefac32476c0339be8621fd8 100644 (file)
@@ -55,8 +55,10 @@ extern pthread_lock_t reply_port_lock;
  */
 
 size_t _pthread_stack_size = 0;
-int _spin_tries = 1;
+int _spin_tries = 0;
+#if !defined(__ppc__)
 int _cpu_has_altivec = 0;
+#endif
 
 /* This global should be used (carefully) by anyone needing to know if a pthread has been
 ** created.
@@ -105,14 +107,6 @@ extern mach_port_t thread_recycle_port;
 
 #endif
 
-/* This is the struct used to recycle (or terminate) a thread */
-/* We stash the thread port into the reply port of the message */
-
-typedef struct {
-       mach_msg_header_t header;
-       mach_msg_trailer_t trailer;
-} recycle_msg_t;
-
 /* Set the base address to use as the stack pointer, before adjusting due to the ABI */
 
 static int
@@ -514,12 +508,6 @@ pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize)
     }
 }
 
-pthread_t _cachedThread = (pthread_t)0;
-
-void _clear_thread_cache(void) {
-    _cachedThread = (pthread_t)0;
-}
-
 /*
  * Create and start execution of a new thread.
  */
@@ -527,7 +515,6 @@ void _clear_thread_cache(void) {
 static void
 _pthread_body(pthread_t self)
 {
-    _clear_thread_cache();
     _pthread_set_self(self);
     pthread_exit((self->fun)(self->arg));
 }
@@ -721,9 +708,9 @@ pthread_detach(pthread_t thread)
                        thread->death = MACH_PORT_NULL;
                        UNLOCK(thread->lock);
                        if (num_joiners > 0)
-                       { /* Have to tell these guys this thread can't be joined with */
-                               swtch_pri(0);
-                               PTHREAD_MACH_CALL(semaphore_signal_all(thread->joiners), kern_res);
+                       {
+                               /* Wake up a joiner */
+                               PTHREAD_MACH_CALL(semaphore_signal(thread->joiners), kern_res);
                        }
                        /* Destroy 'control' semaphores */
                        PTHREAD_MACH_CALL(semaphore_destroy(mach_task_self(),
@@ -731,6 +718,10 @@ pthread_detach(pthread_t thread)
                        PTHREAD_MACH_CALL(semaphore_destroy(mach_task_self(),
                                                    death), kern_res);
                        return (ESUCCESS);
+               } else if (thread->detached == _PTHREAD_EXITED) {
+                       UNLOCK(thread->lock);
+                       pthread_join(thread, NULL);
+                       return ESUCCESS;
                } else
                {
                        UNLOCK(thread->lock);
@@ -748,16 +739,20 @@ pthread_detach(pthread_t thread)
 /* terminated, it will be yanked out from under the mach_msg() call. */
 
 static void _pthread_become_available(pthread_t thread) {
-       recycle_msg_t msg = { { 0 } };
+       mach_msg_empty_rcv_t msg = { { 0 } };
        kern_return_t ret;
 
+       if (thread->reply_port == MACH_PORT_NULL) {
+               thread->reply_port = mach_reply_port();
+       }
        msg.header.msgh_size = sizeof msg - sizeof msg.trailer;
        msg.header.msgh_remote_port = thread_recycle_port;
        msg.header.msgh_local_port = MACH_PORT_NULL; 
        msg.header.msgh_id = (int)thread;
        msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0);
-       ret = mach_msg(&msg.header, MACH_SEND_MSG, msg.header.msgh_size, 0,
-                       MACH_PORT_NULL, MACH_MSG_TIMEOUT_NONE,
+       ret = mach_msg(&msg.header, MACH_SEND_MSG | MACH_RCV_MSG,
+                       msg.header.msgh_size, sizeof msg,
+                       thread->reply_port, MACH_MSG_TIMEOUT_NONE,
                        MACH_PORT_NULL);
        while (1) {
                ret = thread_suspend(thread->kernel_thread);
@@ -767,17 +762,17 @@ static void _pthread_become_available(pthread_t thread) {
 
 /* Check to see if any threads are available. Return immediately */
 
-static kern_return_t _pthread_check_for_available_threads(recycle_msg_t *msg) {
+static kern_return_t _pthread_check_for_available_threads(mach_msg_empty_rcv_t *msg) {
        return mach_msg(&msg->header, MACH_RCV_MSG|MACH_RCV_TIMEOUT, 0,
-                       sizeof(recycle_msg_t), thread_recycle_port, 0,
+                       sizeof(mach_msg_empty_rcv_t), thread_recycle_port, 0,
                        MACH_PORT_NULL);
 }
 
 /* Terminate all available threads and deallocate their stacks */
 static void _pthread_reap_threads(void) {
        kern_return_t ret;
-       recycle_msg_t msg = { { 0 } };
-       while(_pthread_check_for_available_threads(&msg) == KERN_SUCCESS) {
+       mach_msg_empty_rcv_t msg = { { 0 } };
+       while((ret = _pthread_check_for_available_threads(&msg)) == KERN_SUCCESS) {
                pthread_t th = (pthread_t)msg.header.msgh_id;
                mach_port_t kernel_thread = th->kernel_thread;
                mach_port_t reply_port = th->reply_port; 
@@ -807,31 +802,14 @@ static void _pthread_reap_threads(void) {
                }
                free(th);
        }
+       assert(ret == MACH_RCV_TIMED_OUT);
 }
 
-
-static void *
-stackAddress(void)
-{
-    unsigned dummy;
-    return (void *)((unsigned)&dummy & ~ (PTHREAD_STACK_MIN - 1));
-}
-
-extern pthread_t _pthread_self(void);
+/* For compatibility... */
 
 pthread_t
-pthread_self(void)
-{
-    void * myStack = (void *)0;
-    pthread_t cachedThread = _cachedThread;
-    if (cachedThread) {
-        myStack = stackAddress();
-        if ((void *)((unsigned)(cachedThread->stackaddr - 1) & ~ (PTHREAD_STACK_MIN - 1)) == myStack) {
-            return cachedThread;
-        }
-    }
-    _cachedThread = _pthread_self();
-    return _cachedThread;
+_pthread_self() {
+       return pthread_self();
 }
 
 /*
@@ -844,7 +822,6 @@ pthread_exit(void *value_ptr)
         struct _pthread_handler_rec *handler;
        kern_return_t kern_res;
        int num_joiners;
-    _clear_thread_cache();
        while ((handler = self->cleanup_stack) != 0)
        {
                (handler->routine)(handler->arg);
@@ -860,10 +837,14 @@ pthread_exit(void *value_ptr)
                UNLOCK(self->lock);
                if (num_joiners > 0)
                {
-                       swtch_pri(0);
-                       PTHREAD_MACH_CALL(semaphore_signal_all(self->joiners), kern_res);
+                       /* POSIX says that multiple pthread_join() calls on */
+                       /* the same thread are undefined so we just wake up */
+                       /* the first one to join */
+                       PTHREAD_MACH_CALL(semaphore_signal(self->joiners), kern_res);
                }
-               PTHREAD_MACH_CALL(semaphore_wait(self->death), kern_res);
+               do {
+                       PTHREAD_MACH_CALL(semaphore_wait(self->death), kern_res);
+               } while (kern_res == KERN_ABORTED);
        } else
                UNLOCK(self->lock);
        /* Destroy thread & reclaim resources */
@@ -896,7 +877,9 @@ pthread_join(pthread_t thread,
                {
                        thread->num_joiners++;
                        UNLOCK(thread->lock);
-                       PTHREAD_MACH_CALL(semaphore_wait(thread->joiners), kern_res);
+                       do {
+                               PTHREAD_MACH_CALL(semaphore_wait(thread->joiners), kern_res);
+                        } while (kern_res == KERN_ABORTED);
                        LOCK(thread->lock);
                        thread->num_joiners--;
                }
@@ -909,7 +892,6 @@ pthread_join(pthread_t thread,
                                        *value_ptr = thread->exit_value;
                                }
                                UNLOCK(thread->lock);
-                               swtch_pri(0);
                                PTHREAD_MACH_CALL(semaphore_signal(thread->death), kern_res);
                                return (ESUCCESS);
                        } else
@@ -1183,14 +1165,10 @@ pthread_init(void)
        }
        attrs = &_attr;
        pthread_attr_init(attrs);
-    _clear_thread_cache();
-    _pthread_set_self(&_thread);
+       _pthread_set_self(&_thread);
 
         _pthread_create(&_thread, attrs, USRSTACK, mach_thread_self());
-        thread = (pthread_t)malloc(sizeof(struct _pthread));
-       memcpy(thread, &_thread, sizeof(struct _pthread));
-    _clear_thread_cache();
-        _pthread_set_self(thread);
+        thread = &_thread;
         thread->detached = _PTHREAD_CREATE_PARENT;
 
         /* See if we're on a multiprocessor and set _spin_tries if so.  */
@@ -1199,7 +1177,7 @@ pthread_init(void)
        len = sizeof(numcpus);
        if (sysctl(mib, 2, &numcpus, &len, NULL, 0) == 0) {
                if (numcpus > 1) {
-                       _spin_tries = SPIN_TRIES;
+                       _spin_tries = MP_SPIN_TRIES;
                }
        } else {
                count = HOST_BASIC_INFO_COUNT;
@@ -1210,7 +1188,7 @@ pthread_init(void)
                        printf("host_info failed (%d)\n", kr);
                else {
                        if (basic_info.avail_cpus > 1)
-                               _spin_tries = SPIN_TRIES;
+                               _spin_tries = MP_SPIN_TRIES;
                        /* This is a crude test */
                        if (basic_info.cpu_subtype >= CPU_SUBTYPE_POWERPC_7400) 
                                _cpu_has_altivec = 1;
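
A note on the recurring change above: Mach semaphore waits return KERN_ABORTED when the blocked thread is interrupted rather than signalled, and the diff now retries instead of treating that as completion. A minimal sketch of the idiom as a standalone helper (the helper name is invented here; the real code expands PTHREAD_MACH_CALL inline):

    #include <mach/mach.h>
    #include <mach/semaphore.h>

    static kern_return_t
    semaphore_wait_noabort(semaphore_t sem)
    {
            kern_return_t kr;

            do {
                    kr = semaphore_wait(sem);  /* interrupted waits return KERN_ABORTED */
            } while (kr == KERN_ABORTED);      /* retry rather than report an error */
            return kr;
    }
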
index ae6ef055df870d0d938005e4b781b8892c15f903..4eee4a52dcb47de8603ae311a8c54cfc0b96939d 100644 (file)
@@ -295,7 +295,8 @@ _pthread_cond_wait(pthread_cond_t *cond,
     if ((res = pthread_mutex_lock(mutex)) != ESUCCESS) {
         return (res);
     }
-    if (kern_res == KERN_SUCCESS) {
+    /* KERN_ABORTED can be treated as a spurious wakeup */
+    if ((kern_res == KERN_SUCCESS) || (kern_res == KERN_ABORTED)) {
         return (ESUCCESS);
     } else if (kern_res == KERN_OPERATION_TIMED_OUT) {
         return (ETIMEDOUT);
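
Folding KERN_ABORTED into ESUCCESS is safe because it just looks like a spurious wakeup, and POSIX already requires condition waiters to re-check their predicate in a loop. A sketch of the caller-side pattern this relies on (the one-slot queue and its field names are hypothetical):

    #include <pthread.h>

    static struct queue {
            pthread_mutex_t lock;
            pthread_cond_t  cond;
            void           *head;
    } q = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, NULL };

    static void *
    queue_pop(struct queue *qp)         /* e.g. queue_pop(&q) */
    {
            void *item;

            pthread_mutex_lock(&qp->lock);
            while (qp->head == NULL)    /* re-check after every wakeup */
                    pthread_cond_wait(&qp->cond, &qp->lock);
            item = qp->head;
            qp->head = NULL;
            pthread_mutex_unlock(&qp->lock);
            return item;
    }
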
index 1e96b3ca2abb4be86be620c2fa401ce269b9665f..2cfde61efc34dd0c453bbf2824e5553a1df880e9 100644 (file)
@@ -198,24 +198,33 @@ extern boolean_t swtch_pri(int);
 
 /* Number of times to spin when the lock is unavailable and we are on a
    multiprocessor.  On a uniprocessor we yield the processor immediately.  */
-#define SPIN_TRIES 10
+#define        MP_SPIN_TRIES   1000
 extern int _spin_tries;
 extern int __is_threaded;
 extern int _cpu_has_altivec;
 
 /* Internal mutex locks for data structures */
-#define TRY_LOCK(v) (!__is_threaded || _spin_lock_try((pthread_lock_t *)&v))
-#if 0
-#define LOCK(v) if (__is_threaded) _spin_lock((pthread_lock_t)&v)
-#else
-#define LOCK(v) \
-        if (__is_threaded) { \
-               while (!_spin_lock_try((pthread_lock_t *)&v)) { \
-                   syscall_thread_switch(THREAD_NULL, SWITCH_OPTION_WAIT, 1); \
-               } \
-       }
-#endif
-#define UNLOCK(v) if (__is_threaded) _spin_unlock((pthread_lock_t *)&v)
+#define TRY_LOCK(v) (!__is_threaded || _spin_lock_try((pthread_lock_t *)&(v)))
+#define LOCK(v)                                                                                                                                \
+do {                                                                                                                                           \
+       if (__is_threaded) {                                                                                                    \
+               int             tries = _spin_tries;                                                                            \
+                                                                                                                                                       \
+               while (!_spin_lock_try((pthread_lock_t *)&(v))) {                                       \
+                       if (tries-- > 0)                                                                                                \
+                               continue;                                                                                                       \
+                                                                                                                                                       \
+                       syscall_thread_switch(THREAD_NULL, SWITCH_OPTION_DEPRESS, 1);   \
+                       tries = _spin_tries;                                                                                    \
+               }                                                                                                                                       \
+       }                                                                                                                                               \
+} while (0)
+#define UNLOCK(v)                                                              \
+do {                                                                                   \
+       if (__is_threaded)                                                      \
+               _spin_unlock((pthread_lock_t *)&(v));   \
+} while (0)
+
 #ifndef ESUCCESS
 #define ESUCCESS 0
 #endif
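
The new LOCK spins up to _spin_tries times (MP_SPIN_TRIES, now 1000, on a multiprocessor; presumably 0 on a uniprocessor, so the first failed try yields) before depressing the thread's priority and spinning again. The same logic unrolled into a function, purely as a readability sketch; the typedef and externs stand in for declarations the header already has:

    #include <mach/mach.h>
    #include <mach/thread_switch.h>

    typedef int pthread_lock_t;         /* assumed; as in pthread_internals.h */
    extern int _spin_tries;
    extern int _spin_lock_try(pthread_lock_t *);
    extern kern_return_t syscall_thread_switch(mach_port_t, int, mach_msg_timeout_t);

    static void
    lock_sketch(pthread_lock_t *lock)
    {
            int tries = _spin_tries;

            while (!_spin_lock_try(lock)) {
                    if (tries-- > 0)
                            continue;   /* spins left: try again */
                    /* exhausted: give up the CPU briefly, then start over */
                    syscall_thread_switch(THREAD_NULL, SWITCH_OPTION_DEPRESS, 1);
                    tries = _spin_tries;
            }
    }
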
index 1276e60f8572fa62d713c16415130dbe86035e9c..427026a78a0731c1d4ac71bfb8b2d845fe375950 100644 (file)
@@ -141,7 +141,9 @@ pthread_mutex_lock(pthread_mutex_t *mutex)
                        mutex->sem = new_sem_from_pool();
                }
                 UNLOCK(mutex->lock);
-               PTHREAD_MACH_CALL(semaphore_wait(mutex->sem), kern_res);
+               do {
+                       PTHREAD_MACH_CALL(semaphore_wait(mutex->sem), kern_res);
+               } while (kern_res == KERN_ABORTED);
                 LOCK(mutex->lock);
                mutex->waiters--;
                if (mutex->waiters == 0) {
index f819e259b1313657f8394ea02b50ab71e024ec6f..2e49b6bd4849f59d2dccee40d52de7148f0fd384 100644 (file)
@@ -276,7 +276,7 @@ __uqtoa(val, endp, base, octzero, xdigs)
 #define        BUF             (MAXEXP+MAXFRACT+1)     /* + decimal point */
 #define        DEFPREC         6
 
-static char *cvt __P((double, int, int, char *, int *, int, int *));
+static char *cvt __P((double, int, int, char *, int *, int, int *, char **));
 static int exponent __P((char *, int, int));
 
 #else /* no FLOATING_POINT */
@@ -322,6 +322,7 @@ vfprintf(fp, fmt0, ap)
        int expsize = 0;        /* character count for expstr */
        int ndig;               /* actual number of digits returned by cvt */
        char expstr[7];         /* buffer for exponent string */
+       char *dtoaresult;       /* buffer allocated by dtoa */
 #endif
        u_long  ulval = 0;      /* integer arguments %[diouxX] */
        u_quad_t uqval = 0;     /* %q integers */
@@ -428,8 +429,9 @@ vfprintf(fp, fmt0, ap)
         } else { \
                val = GETARG (int); \
         }
-        
-
+#ifdef FLOATING_POINT
+       dtoaresult = NULL;
+#endif
        /* FLOCKFILE(fp); */
        /* sorry, fprintf(read_only_file, "") returns EOF, not 0 */
        if (cantwrite(fp)) {
@@ -621,7 +623,7 @@ fp_begin:           if (prec == -1)
                        }
                        flags |= FPT;
                        cp = cvt(_double, prec, flags, &softsign,
-                               &expt, ch, &ndig);
+                               &expt, ch, &ndig, &dtoaresult);
                        if (ch == 'g' || ch == 'G') {
                                if (expt <= -4 || expt > prec)
                                        ch = (ch == 'g') ? 'e' : 'E';
@@ -877,6 +879,10 @@ number:                    if ((dprec = prec) >= 0)
 done:
        FLUSH();
 error:
+#ifdef FLOATING_POINT
+       if (dtoaresult != NULL)
+               free(dtoaresult);
+#endif
        if (__sferror(fp))
                ret = EOF;
        /* FUNLOCKFILE(fp); */
@@ -911,7 +917,7 @@ error:
  * Find all arguments when a positional parameter is encountered.  Returns a
  * table, indexed by argument number, of pointers to each arguments.  The
  * initial argument table should be an array of STATIC_ARG_TBL_SIZE entries.
- * It will be replaces with a malloc-ed on if it overflows.
+ * It will be replaced with a malloc-ed one if it overflows.
  */ 
 static void
 __find_arguments (fmt0, ap, argtable)
@@ -937,8 +943,8 @@ __find_arguments (fmt0, ap, argtable)
 #define ADDTYPE(type) \
        ((nextarg >= tablesize) ? \
                __grow_type_table(nextarg, &typetable, &tablesize) : 0, \
-       typetable[nextarg++] = type, \
-       (nextarg > tablemax) ? tablemax = nextarg : 0)
+       (nextarg > tablemax) ? tablemax = nextarg : 0, \
+       typetable[nextarg++] = type)
 
 #define        ADDSARG() \
        ((flags&LONGINT) ? ADDTYPE(T_LONG) : \
@@ -1191,33 +1197,38 @@ __grow_type_table (nextarg, typetable, tablesize)
        unsigned char **typetable;
        int *tablesize;
 {
-       unsigned char *oldtable = *typetable;
-       int newsize = *tablesize * 2;
-
-       if (*tablesize == STATIC_ARG_TBL_SIZE) {
-               *typetable = (unsigned char *)
-                   malloc (sizeof (unsigned char) * newsize);
-               bcopy (oldtable, *typetable, *tablesize);
+       unsigned char *const oldtable = *typetable;
+       const int oldsize = *tablesize;
+       unsigned char *newtable;
+       int newsize = oldsize * 2;
+
+       if (newsize < nextarg + 1)
+               newsize = nextarg + 1;
+       if (oldsize == STATIC_ARG_TBL_SIZE) {
+               if ((newtable = malloc (newsize)) == NULL)
+                       abort();        /* XXX handle better */
+               bcopy (oldtable, newtable, oldsize);
        } else {
-               *typetable = (unsigned char *)
-                   realloc (typetable, sizeof (unsigned char) * newsize);
-
+               if ((newtable = realloc (oldtable, newsize)) == NULL)
+                       abort();        /* XXX handle better */
        }
-       memset (&typetable [*tablesize], T_UNUSED, (newsize - *tablesize));
+       memset (&newtable [oldsize], T_UNUSED, (newsize - oldsize));
 
+       *typetable = newtable;
        *tablesize = newsize;
 }
 
 
 #ifdef FLOATING_POINT
 
-extern char *__dtoa __P((double, int, int, int *, int *, char **));
+extern char *__dtoa __P((double, int, int, int *, int *, char **, char **));
 
 static char *
-cvt(value, ndigits, flags, sign, decpt, ch, length)
+cvt(value, ndigits, flags, sign, decpt, ch, length, dtoaresultp)
        double value;
        int ndigits, flags, *decpt, ch, *length;
        char *sign;
+       char **dtoaresultp;
 {
        int mode, dsgn;
        char *digits, *bp, *rve;
@@ -1239,7 +1250,7 @@ cvt(value, ndigits, flags, sign, decpt, ch, length)
                *sign = '-';
        } else
                *sign = '\000';
-       digits = __dtoa(value, mode, ndigits, decpt, &dsgn, &rve);
+       digits = __dtoa(value, mode, ndigits, decpt, &dsgn, &rve, dtoaresultp);
        if ((ch != 'g' && ch != 'G') || flags & ALT) {
                /* print trailing zeros */
                bp = digits + ndigits;
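
The thread through vfprintf(), cvt(), and __dtoa() replaces __dtoa's static result buffer with one malloc'd per call and handed back through the new last argument; vfprintf() then frees it exactly once on the way out. A usage sketch of the new contract (values arbitrary, variable names illustrative):

    #include <stdlib.h>

    extern char *__dtoa(double, int, int, int *, int *, char **, char **);

    void
    dtoa_example(void)
    {
            char *dtoaresult = NULL;    /* the caller owns this buffer now */
            int decpt, sign;
            char *rve, *digits;

            digits = __dtoa(3.14159, 3 /* mode */, 6 /* ndigits */,
                            &decpt, &sign, &rve, &dtoaresult);
            /* ... digits[0 .. rve - digits - 1], decimal point at decpt ... */
            if (dtoaresult != NULL)
                    free(dtoaresult);   /* exactly one free, by the caller */
    }
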
index c245bfc1ba459e01247c659bb7d59d6667081a64..a12e16764c9fb1b8b3b6fff3457cd2984fbe5a57 100644 (file)
@@ -80,6 +80,7 @@
 #define        SUPPRESS        0x08    /* suppress assignment */
 #define        POINTER         0x10    /* weird %p pointer (`fake hex') */
 #define        NOSKIP          0x20    /* do not skip blanks */
+#define QUAD            0x400
 
 /*
  * The following are used in numeric conversions only:
 #define        CT_CHAR         0       /* %c conversion */
 #define        CT_CCL          1       /* %[...] conversion */
 #define        CT_STRING       2       /* %s conversion */
-#define        CT_INT          3       /* integer, i.e., strtol or strtoul */
+#define        CT_INT          3       /* integer, i.e., strtoq or strtouq */
 #define        CT_FLOAT        4       /* floating, i.e., strtod */
 
 #define u_char unsigned char
 #define u_long unsigned long
 
-static u_char *__sccl();
+static u_char *__sccl(char *, u_char *);
 
 /*
  * vfscanf
@@ -127,8 +128,8 @@ __svfscanf(fp, fmt0, ap)
        register char *p0;      /* saves original value of p when necessary */
        int nassigned;          /* number of fields assigned */
        int nread;              /* number of characters consumed from fp */
-       int base;               /* base argument to strtol/strtoul */
-       u_long (*ccfn)();       /* conversion function (strtol/strtoul) */
+       int base;               /* base argument to strtoq/strtouq */
+       u_quad_t (*ccfn)();     /* conversion function (strtoq/strtouq) */
        char ccltab[256];       /* character class table for %[...] */
        char buf[BUF];          /* buffer for numeric conversions */
 
@@ -180,6 +181,9 @@ literal:
                case 'l':
                        flags |= LONG;
                        goto again;
+               case 'q':
+                       flags |= QUAD;
+                       goto again;
                case 'L':
                        flags |= LONGDBL;
                        goto again;
@@ -204,13 +208,13 @@ literal:
                        /* FALLTHROUGH */
                case 'd':
                        c = CT_INT;
-                       ccfn = (u_long (*)())strtol;
+                       ccfn = (u_quad_t (*)())strtoq;
                        base = 10;
                        break;
 
                case 'i':
                        c = CT_INT;
-                       ccfn = (u_long (*)())strtol;
+                       ccfn = (u_quad_t (*)())strtoq;
                        base = 0;
                        break;
 
@@ -219,13 +223,13 @@ literal:
                        /* FALLTHROUGH */
                case 'o':
                        c = CT_INT;
-                       ccfn = strtoul;
+                       ccfn = strtouq;
                        base = 8;
                        break;
 
                case 'u':
                        c = CT_INT;
-                       ccfn = strtoul;
+                       ccfn = strtouq;
                        base = 10;
                        break;
 
@@ -235,7 +239,7 @@ literal:
                case 'x':
                        flags |= PFXOK; /* enable 0x prefixing */
                        c = CT_INT;
-                       ccfn = strtoul;
+                       ccfn = strtouq;
                        base = 16;
                        break;
 
@@ -267,7 +271,7 @@ literal:
                case 'p':       /* pointer format is like hex */
                        flags |= POINTER | PFXOK;
                        c = CT_INT;
-                       ccfn = strtoul;
+                       ccfn = strtouq;
                        base = 16;
                        break;
 
@@ -278,6 +282,8 @@ literal:
                                *va_arg(ap, short *) = nread;
                        else if (flags & LONG)
                                *va_arg(ap, long *) = nread;
+                       else if (flags & QUAD)
+                               *va_arg(ap, quad_t *) = nread;
                        else
                                *va_arg(ap, int *) = nread;
                        continue;
@@ -292,7 +298,7 @@ literal:
                        if (isupper(c))
                                flags |= LONG;
                        c = CT_INT;
-                       ccfn = (u_long (*)())strtol;
+                       ccfn = (u_quad_t (*)())strtoq;
                        base = 10;
                        break;
                }
@@ -434,7 +440,7 @@ literal:
                        continue;
 
                case CT_INT:
-                       /* scan an integer as if by strtol/strtoul */
+                       /* scan an integer as if by strtoq/strtouq */
 #ifdef hardway
                        if (width == 0 || width > sizeof(buf) - 1)
                                width = sizeof(buf) - 1;
@@ -552,7 +558,7 @@ literal:
                                (void) ungetc(c, fp);
                        }
                        if ((flags & SUPPRESS) == 0) {
-                               u_long res;
+                               u_quad_t res;
 
                                *p = 0;
                                res = (*ccfn)(buf, (char **)NULL, base);
@@ -562,6 +568,8 @@ literal:
                                        *va_arg(ap, short *) = res;
                                else if (flags & LONG)
                                        *va_arg(ap, long *) = res;
+                               else if (flags & QUAD)
+                                       *va_arg(ap, quad_t *) = res;
                                else
                                        *va_arg(ap, int *) = res;
                                nassigned++;
@@ -651,7 +659,9 @@ literal:
 
                                *p = 0;
                                res = strtod(buf,(char **) NULL);
-                               if (flags & LONG)
+                               if (flags & LONGDBL)
+                                       *va_arg(ap, long double *) = res;
+                               else if (flags & LONG)
                                        *va_arg(ap, double *) = res;
                                else
                                        *va_arg(ap, float *) = res;
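
With the QUAD flag plumbed through, the %q length modifier scans 64-bit integers via strtoq/strtouq and stores through a quad_t pointer (%qd is the BSD spelling that predates C99's %lld). A small usage sketch:

    #include <stdio.h>
    #include <sys/types.h>

    int
    main(void)
    {
            quad_t big;

            /* routed through CT_INT with ccfn = strtoq, stored via the QUAD branch */
            if (sscanf("12345678901", "%qd", &big) == 1)
                    return 0;
            return 1;
    }
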
index 0ed39d86097d505e4c81fb4baaa1639bd26a2a8d..05c075e9fb909062e6561f87225061ccf7cda09a 100644 (file)
@@ -386,7 +386,7 @@ extern double rnd_prod(double, double), rnd_quot(double, double);
 #ifdef __cplusplus
 extern "C" double strtod(const char *s00, char **se);
 extern "C" char *__dtoa(double d, int mode, int ndigits,
-                       int *decpt, int *sign, char **rve);
+                       int *decpt, int *sign, char **rve, char **resultp);
 #endif
 
  struct
@@ -398,8 +398,6 @@ Bigint {
 
  typedef struct Bigint Bigint;
 
- static Bigint *freelist[Kmax+1];
-
  static Bigint *
 Balloc
 #ifdef KR_headers
@@ -411,18 +409,13 @@ Balloc
        int x;
        Bigint *rv;
 
-       if (rv = freelist[k]) {
-               freelist[k] = rv->next;
-               }
-       else {
-               x = 1 << k;
-               rv = (Bigint *)MALLOC(sizeof(Bigint) + (x-1)*sizeof(Long));
-               rv->k = k;
-               rv->maxwds = x;
-               }
+       x = 1 << k;
+       rv = (Bigint *)malloc(sizeof(Bigint) + (x-1)*sizeof(Long));
+       rv->k = k;
+       rv->maxwds = x;
        rv->sign = rv->wds = 0;
        return rv;
-       }
+}
 
  static void
 Bfree
@@ -432,11 +425,8 @@ Bfree
        (Bigint *v)
 #endif
 {
-       if (v) {
-               v->next = freelist[v->k];
-               freelist[v->k] = v;
-               }
-       }
+       free(v);
+}
 
 #define Bcopy(x,y) memcpy((char *)&x->sign, (char *)&y->sign, \
 y->wds*sizeof(Long) + 2*sizeof(int))
@@ -1916,9 +1906,9 @@ quorem
 __dtoa
 #ifdef KR_headers
-       (d, mode, ndigits, decpt, sign, rve)
-       double d; int mode, ndigits, *decpt, *sign; char **rve;
+       (d, mode, ndigits, decpt, sign, rve, resultp)
+       double d; int mode, ndigits, *decpt, *sign; char **rve, **resultp;
 #else
-       (double d, int mode, int ndigits, int *decpt, int *sign, char **rve)
+       (double d, int mode, int ndigits, int *decpt, int *sign, char **rve, char **resultp)
 #endif
 {
  /*    Arguments ndigits, decpt, sign are similar to those
@@ -1966,15 +1956,6 @@ __dtoa
        Bigint *b, *b1, *delta, *mlo, *mhi, *S;
        double d2, ds, eps;
        char *s, *s0;
-       static Bigint *result;
-       static int result_k;
-
-       if (result) {
-               result->k = result_k;
-               result->maxwds = 1 << result_k;
-               Bfree(result);
-               result = 0;
-               }
 
        if (word0(d) & Sign_bit) {
                /* set sign for everything, including 0's and NaNs */
@@ -2136,11 +2117,8 @@ __dtoa
                        if (i <= 0)
                                i = 1;
                }
-       j = sizeof(ULong);
-       for(result_k = 0; sizeof(Bigint) - sizeof(ULong) + j <= i;
-               j <<= 1) result_k++;
-       result = Balloc(result_k);
-       s = s0 = (char *)result;
+       *resultp = (char *) malloc(i + 1);
+       s = s0 = *resultp;
 
        if (ilim >= 0 && ilim <= Quick_max && try_quick) {
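
Removing the static freelist and the static result Bigint leaves dtoa with no unsynchronized shared state, so concurrent floating-point formatting no longer races. A smoke-test sketch under that assumption (iteration count arbitrary):

    #include <pthread.h>
    #include <stdio.h>

    static void *
    fmt(void *arg)
    {
            char buf[32];
            int i;

            for (i = 0; i < 100000; i++)        /* hammer dtoa from two threads */
                    snprintf(buf, sizeof(buf), "%.17g", 3.141592653589793);
            return arg;
    }

    int
    main(void)
    {
            pthread_t t1, t2;

            pthread_create(&t1, NULL, fmt, NULL);
            pthread_create(&t2, NULL, fmt, NULL);
            pthread_join(t1, NULL);
            pthread_join(t2, NULL);
            return 0;
    }
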
 
index d925f1217b347226b11afeef5d20f5718b2252e5..657b4f964cd2fa81a2d735f0be19510ec7162a3e 100644 (file)
@@ -67,9 +67,10 @@ memccpy(t, f, c, n)
        if (n) {
                register unsigned char *tp = t;
                register const unsigned char *fp = f;
+               register unsigned char uc = c;
                do {
-                       if ((*tp++ = *fp++) == c)
-                               return (t);
+                       if ((*tp++ = *fp++) == uc)
+                               return (tp);
                } while (--n != 0);
        }
        return (0);
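
The fix makes memccpy() return tp, the destination position just past the copied character, as the interface documents (the old code returned the start of the buffer), and compares through an unsigned char so that negative c arguments match. Usage sketch:

    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
            char dst[16];
            /* copy up to and including ':'; returns just past it in dst */
            char *p = memccpy(dst, "user:pass", ':', sizeof(dst));

            if (p != NULL) {
                    *p = '\0';          /* dst now holds "user:" */
                    printf("%s\n", dst);
            }
            return 0;
    }
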
index d07239b50eac0d1cc4fa515bf315218c917c8252..8aa14acaee5433a75aa0e6d06140000d4b60ddca 100644 (file)
@@ -36,21 +36,26 @@ int gettimeofday (struct timeval *tp, struct timezone *tzp)
 {
         static int validtz = 0;
         static struct timezone cached_tz = {0};
+        struct timeval localtv;
+  
+        if (tzp && (tp == NULL) && (validtz == 0)) {
+                tp = &localtv;
+        }
 
         if (syscall (SYS_gettimeofday, tp, tzp) < 0) {
                 return (-1);
         }
-        if (validtz == 0)  {
-               struct tm *localtm = localtime ((time_t *)&tp->tv_sec);
-                cached_tz.tz_dsttime = localtm->tm_isdst;
-                cached_tz.tz_minuteswest =
-                        (-localtm->tm_gmtoff / SECSPERMIN) +
-                        (localtm->tm_isdst * MINSPERHOUR);
-                validtz = 1;
-        }
         if (tzp) {
-          tzp->tz_dsttime = cached_tz.tz_dsttime;
-          tzp->tz_minuteswest = cached_tz.tz_minuteswest;
+               if (validtz == 0)  {
+                       struct tm *localtm = localtime ((time_t *)&tp->tv_sec);
+                       cached_tz.tz_dsttime = localtm->tm_isdst;
+                       cached_tz.tz_minuteswest =
+                               (-localtm->tm_gmtoff / SECSPERMIN) +
+                               (localtm->tm_isdst * MINSPERHOUR);
+                       validtz = 1;
+               }
+               tzp->tz_dsttime = cached_tz.tz_dsttime;
+               tzp->tz_minuteswest = cached_tz.tz_minuteswest;
         }
         return (0);
 }
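
The rework derives the cached timezone only when a caller actually asks for one, and the localtv substitution covers the first such caller passing tp == NULL, since localtime() still needs a real time value. That call shape is legal:

    #include <stddef.h>
    #include <sys/time.h>

    int
    main(void)
    {
            struct timezone tz;

            /* timezone only, no timeval: the case localtv exists for */
            return gettimeofday(NULL, &tz);
    }
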
index 714a205cd565894e3f63e0319ac94b9e42c917d0..edd5f4fe5475f05abacd2d50cd856025bdaeeac2 100644 (file)
@@ -24,6 +24,7 @@
  */
 #include "SYS.h"
 
+#if 0
 LEAF(_vfork, 0) 
        CALL_EXTERN(__cthread_fork_prepare)
 #if defined(__DYNAMIC__)
@@ -161,4 +162,24 @@ L2:
        CALL_EXTERN_AGAIN(__cthread_fork_parent)
        pop     %eax
        ret             
+#else
+
+LEAF(_vfork, 0)
+        popl    %ecx
+        movl    $SYS_vfork,%eax;      // code for vfork -> eax
+        UNIX_SYSCALL_TRAP;              // do the system call
+        jnb     L1                      // jump if CF==0
+        pushl   %ecx
+        BRANCH_EXTERN(cerror)
+
+L1:
+        orl     %edx,%edx       // CF=OF=0,  ZF set if zero result
+        jz      L2              // parent, since %edx == 0 in parent, 1 in child
+        xorl    %eax,%eax       // zero eax
+        jmp     *%ecx
+
+L2:
+        jmp     *%ecx
+
+#endif
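
The replacement is the bare vfork trap: the return address is popped into %ecx before the system call and both parent and child leave via jmp *%ecx, since the two share one stack and the old cthread fork hooks (kept under #if 0) are unsafe to run on it. Callers keep the usual contract, sketched here; the child may only execve() or _exit():

    #include <unistd.h>

    int
    main(void)
    {
            pid_t pid = vfork();

            if (pid == 0) {             /* child borrows the parent's stack */
                    execl("/usr/bin/true", "true", (char *)0);
                    _exit(127);         /* exec failed; never plain exit() */
            }
            return pid < 0;             /* parent resumes once the child execs */
    }
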
 
index 4591e54befcc5a4295f5d789703a6c136a2856f9..6bdeb02469c09cca585e8cd4a6b8fdd81c3d1da9 100644 (file)
  *     8 September 1998        Matt Watson (mwatson@apple.com)
  *             Created. Derived from longjmp.s
  */
-#include "SYS.h"
+
 #include <architecture/ppc/asm_help.h>
 #include "_setjmp.h"
 
+#define        VRSave  256
+
+/* special flag bit definitions copied from /osfmk/ppc/thread_act.h */
+
+#define floatUsedbit   1
+#define vectorUsedbit  2
+
+
+#if defined(__DYNAMIC__)
+        .data
+       .non_lazy_symbol_pointer
+       .align 2
+L_memmove$non_lazy_ptr:
+       .indirect_symbol _memmove
+       .long 0
+       .non_lazy_symbol_pointer
+       .align 2
+L__cpu_has_altivec$non_lazy_ptr:
+       .indirect_symbol __cpu_has_altivec
+       .long 0
+        .text
+#endif        
+        
 LEAF(__longjmp)
+
+        ; need to restore FPRs or VRs?
+        
+        lwz    r5,JMP_flags(r3)
+        lwz    r6,JMP_addr_at_setjmp(r3)
+        rlwinm r7,r5,0,vectorUsedbit,vectorUsedbit
+        rlwinm r8,r5,0,floatUsedbit,floatUsedbit
+        cmpw   cr1,r3,r6               ; jmp_buf still at same address?
+        cmpwi  cr3,r7,0                ; set cr3 iff VRs in use (non-volatile CR)
+        cmpwi  cr4,r8,0                ; set cr4 iff FPRs in use (non-volatile CR)
+        beq+   cr1,LRestoreVRs
+        
+        ; jmp_buf was moved since setjmp (or is uninitialized).
+        ; We must move VRs and FPRs to be quadword aligned at present address.
+        
+        stw    r3,JMP_addr_at_setjmp(r3) ; update, in case we longjmp to this again
+        mr     r31,r4                  ; save "val" arg across memmove
+        mr     r30,r3                  ; and jmp_buf ptr
+        addi   r3,r3,JMP_vr_base_addr
+        addi   r4,r6,JMP_vr_base_addr
+        rlwinm r3,r3,0,0,27            ; r3 <- QW aligned addr where they should be
+        rlwinm r4,r4,0,0,27            ; r4 <- QW aligned addr where they originally were
+        sub    r7,r4,r6                ; r7 <- offset of VRs/FPRs within jmp_buf
+        add    r4,r30,r7               ; r4 <- where they are now
+        li     r5,(JMP_buf_end - JMP_vr_base_addr)
+#if defined(__DYNAMIC__)
+        bcl     20,31,1f               ; Get pic-base
+1:      mflr    r12                    
+        addis   r12, r12, ha16(L_memmove$non_lazy_ptr - 1b)
+        lwz     r12, lo16(L_memmove$non_lazy_ptr - 1b)(r12)
+        mtctr   r12                    ; Get address left by dyld
+        bctrl
+#else
+       bl      _memmove
+#endif
+        mr     r3,r30
+        mr     r4,r31
+        
+        ; Restore VRs iff any
+        ;      cr3 - bne if VRs
+        ;      cr4 - bne if FPRs
+        
+LRestoreVRs:
+        beq+   cr3,LZeroVRSave         ; no VRs
+        lwz    r0,JMP_vrsave(r3)
+        addi   r6,r3,JMP_vr_base_addr
+        cmpwi  r0,0                    ; any live VRs?
+        mtspr  VRSave,r0
+        beq+   LRestoreFPRs
+        lvx    v20,0,r6
+        li     r7,16*1
+        lvx    v21,r7,r6
+        li     r7,16*2
+        lvx    v22,r7,r6
+        li     r7,16*3
+        lvx    v23,r7,r6
+        li     r7,16*4
+        lvx    v24,r7,r6
+        li     r7,16*5
+        lvx    v25,r7,r6
+        li     r7,16*6
+        lvx    v26,r7,r6
+        li     r7,16*7
+        lvx    v27,r7,r6
+        li     r7,16*8
+        lvx    v28,r7,r6
+        li     r7,16*9
+        lvx    v29,r7,r6
+        li     r7,16*10
+        lvx    v30,r7,r6
+        li     r7,16*11
+        lvx    v31,r7,r6
+        b      LRestoreFPRs            ; skip zeroing VRSave
+        
+        ; Zero VRSave iff Altivec is supported, but VRs were not in use
+        ; at setjmp time.  This covers the case where VRs are first used after
+        ; the setjmp but before the longjmp, and where VRSave is nonzero at
+        ; the longjmp.  We need to zero it now, or it will always remain
+        ; nonzero, since its bits are sticky.
+
+LZeroVRSave:
+#if defined(__DYNAMIC__)
+        bcl    20,31,1f
+1:     mflr    r9                      ; get our address
+        addis  r6,r9,ha16(L__cpu_has_altivec$non_lazy_ptr - 1b)
+        lwz    r7,lo16(L__cpu_has_altivec$non_lazy_ptr - 1b)(r6)
+        lwz    r7,0(r7)                ; load the flag
+#else
+        lis    r7, ha16(__cpu_has_altivec)
+       lwz     r7, lo16(__cpu_has_altivec)(r7)
+#endif
+       cmpwi   r7,0
+        li     r8,0
+        beq    LRestoreFPRs            ; no Altivec, so skip
+        mtspr  VRSave,r8
+        
+        ; Restore FPRs if any
+        ;      cr4 - bne iff FPRs
+        
+LRestoreFPRs:
+        beq    cr4,LRestoreGPRs        ; FPRs not in use at setjmp
+        addi   r6,r3,JMP_fp_base_addr
+        rlwinm r6,r6,0,0,27            ; mask off low 4 bits to qw align
+        lfd    f14,0*8(r6)
+        lfd    f15,1*8(r6)
+        lfd    f16,2*8(r6)
+        lfd    f17,3*8(r6)
+        lfd    f18,4*8(r6)
+        lfd    f19,5*8(r6)
+        lfd    f20,6*8(r6)
+        lfd    f21,7*8(r6)
+        lfd    f22,8*8(r6)
+        lfd    f23,9*8(r6)
+        lfd    f24,10*8(r6)
+        lfd    f25,11*8(r6)
+        lfd    f26,12*8(r6)
+        lfd    f27,13*8(r6)
+        lfd    f28,14*8(r6)
+        lfd    f29,15*8(r6)
+        lfd    f30,16*8(r6)
+        lfd    f31,17*8(r6)
+        
+        ; Restore GPRs
+        
+LRestoreGPRs:
        lwz r31, JMP_r31(r3)
        /* r1, r14-r30 */
        lwz r1,  JMP_r1 (r3)
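
The preamble above handles a jmp_buf that moved between _setjmp and _longjmp: since jmp_bufs are only word aligned, the quadword-aligned VR/FPR areas land at a different offset after a copy, and the saved JMP_addr_at_setjmp lets the code memmove them back into place. The case it guards against, sketched:

    #include <setjmp.h>
    #include <string.h>

    static jmp_buf original, copy;

    void
    jmp_copy_example(void)
    {
            if (_setjmp(original) == 0) {
                    /* copy may have a different 16-byte alignment */
                    memcpy(copy, original, sizeof(jmp_buf));
                    _longjmp(copy, 1);  /* the realignment path runs here */
            }
    }
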
index e97255c3d7172485bbef7662c0b33ddf2bd8ae9c..8a7881708541154e205f646fb18350efe69d209b 100644 (file)
  *
  */
 
+/* NOTE: jmp_bufs are only 4-byte aligned.  This means we
+ * need to pad before the VR and FPR save areas, so that they
+ * can be naturally aligned in the buffer.  In case a jmp_buf
+ * is bcopy'd to a different alignment between the setjmp
+ * and longjmp, we need to save the jmp_buf address in the
+ * jmp_buf at setjmp time, so we can realign before reloading.
+ */
 #define JMP_r1 0x00
 #define JMP_r2 0x04
 #define JMP_r13        0x08
 #define JMP_xer        0x60
 #define JMP_sig        0x64
 #define JMP_SIGFLAG 0x68
+#define JMP_flags 0x6c
+#define JMP_vrsave 0x70
+#define JMP_addr_at_setjmp 0x74
+/* 12 bytes padding here */
+#define JMP_vr_base_addr 0x84
+/* save room for 12 VRs (v20-v31), or 0xC0 bytes */
+#define JMP_fp_base_addr 0x144
+/* save room for 18 FPRs (f14-f31), or 0x90 bytes */
+#define JMP_buf_end 0x1d4
+
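
The offsets are self-consistent: 12 bytes of padding after JMP_addr_at_setjmp (0x74 + 4 + 12 = 0x84) let a word-aligned buffer round up to a quadword boundary, then 12 VRs and 18 FPRs follow. A compile-time sketch of the arithmetic, using the C89 negative-array-size trick:

    /* 0x84 + 12*16 == 0x144: v20-v31 end exactly at the FPR area */
    typedef char vr_area_ok[(0x84 + 12*16 == 0x144) ? 1 : -1];
    /* 0x144 + 18*8 == 0x1d4: f14-f31 end exactly at JMP_buf_end  */
    typedef char fp_area_ok[(0x144 + 18*8 == 0x1d4) ? 1 : -1];
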
index 2be62c8af287407d1377edc6903803add9ccc233..c69f9ad71119e53fb2acce37e84621c5d6015881 100644 (file)
  *             Created. Derived from setjmp.s
  */
 
-#include "SYS.h"
+
 #include <architecture/ppc/asm_help.h>
 #include "_setjmp.h"
 
+#define        VRSave  256
+
+/* special flag bit definitions copied from /osfmk/ppc/thread_act.h */
+
+#define floatUsedbit   1
+#define vectorUsedbit  2
+
+#define        FlagsFastTrap   0x7FF3
+
+
 LEAF(__setjmp)
        stw r31, JMP_r31(r3)
        /* r1, r2, r13-r30 */
@@ -68,6 +78,77 @@ LEAF(__setjmp)
        stw r5, JMP_lr(r3)
        stw r6, JMP_ctr(r3)
        stw r7, JMP_xer(r3)
-       li r3, 0
+        
+        mr     r31,r3                          ; save jmp_buf ptr
+        li     r0,FlagsFastTrap
+        sc                                     ; get FPR-inuse and VR-inuse flags from kernel
+        rlwinm r4,r3,0,floatUsedbit,floatUsedbit
+        rlwinm.        r5,r3,0,vectorUsedbit,vectorUsedbit
+        cmpwi  cr1,r4,0                        ; set CR1 bne iff FPRs in use
+        stw    r3,JMP_flags(r31)
+        stw    r31,JMP_addr_at_setjmp(r31)
+        mr     r3,r31                          ; restore jmp_buf ptr
+        lwz    r31,JMP_r31(r31)
+        beq    LSaveFPRsIfNecessary            ; skip if vectorUsedbit was 0
+        
+        ; must save VRs and VRSAVE
+        
+        mfspr  r4,VRSave
+        andi.  r0,r4,0xFFF                     ; we only care about v20-v31
+        stw    r0,JMP_vrsave(r3)               ; set up effective VRSAVE
+        beq    LSaveFPRsIfNecessary            ; no live non-volatile VRs
+        addi   r6,r3,JMP_vr_base_addr
+        stvx   v20,0,r6
+        li     r4,16*1
+        stvx   v21,r4,r6
+        li     r4,16*2
+        stvx   v22,r4,r6
+        li     r4,16*3
+        stvx   v23,r4,r6
+        li     r4,16*4
+        stvx   v24,r4,r6
+        li     r4,16*5
+        stvx   v25,r4,r6
+        li     r4,16*6
+        stvx   v26,r4,r6
+        li     r4,16*7
+        stvx   v27,r4,r6
+        li     r4,16*8
+        stvx   v28,r4,r6
+        li     r4,16*9
+        stvx   v29,r4,r6
+        li     r4,16*10
+        stvx   v30,r4,r6
+        li     r4,16*11
+        stvx   v31,r4,r6
+        
+        ; must save FPRs if they are live in this thread
+        ;      CR1 = bne iff FPRs are in use
+        
+LSaveFPRsIfNecessary:
+        beq    cr1,LExit                       ; FPRs not in use
+        addi   r6,r3,JMP_fp_base_addr
+        rlwinm r6,r6,0,0,27                    ; mask off low 4 bits to qw align
+        stfd   f14,0*8(r6)
+        stfd   f15,1*8(r6)
+        stfd   f16,2*8(r6)
+        stfd   f17,3*8(r6)
+        stfd   f18,4*8(r6)
+        stfd   f19,5*8(r6)
+        stfd   f20,6*8(r6)
+        stfd   f21,7*8(r6)
+        stfd   f22,8*8(r6)
+        stfd   f23,9*8(r6)
+        stfd   f24,10*8(r6)
+        stfd   f25,11*8(r6)
+        stfd   f26,12*8(r6)
+        stfd   f27,13*8(r6)
+        stfd   f28,14*8(r6)
+        stfd   f29,15*8(r6)
+        stfd   f30,16*8(r6)
+        stfd   f31,17*8(r6)
+
+LExit:
+       li      r3, 0
        blr
 
index f3695ba60f26a5a2f2b6fd5cfe3de8e497183dfa..50ff2be0c5aef9f8653407f4f75f833aefca20b1 100644 (file)
@@ -21,8 +21,8 @@
  */
         .text
         .align 2
-        .globl __pthread_self
-__pthread_self:
+        .globl _pthread_self
+_pthread_self:
         li r0, 0x7FF2
         sc
         blr
index 6a3277acc6134db063241e69bdc06f72fa08e359..14bc4f338dbf5f0a980765ae130fdfe526962699 100644 (file)
@@ -29,7 +29,7 @@
  *
  */
 
-#if 1
+#if 0
 #import <sys/syscall.h>
 #import <architecture/ppc/asm_help.h>
 #import        <architecture/ppc/pseudo_inst.h>
index 66f86ea55bd0289fe5ce0a4965a06c6d5ec15987..a61e7576eead0df83619a03bbf52b8c15b98ed62 100644 (file)
@@ -14,7 +14,7 @@ PROJECT_TYPE = Component
 
 HFILES = cthread_internals.h cthreads.h
 
-CFILES = cprocs.c cthreads.c lu_utils.c mig_support.c threads_data.c
+CFILES = cprocs.c cthreads.c lu_utils.c mig_support.c
 
 SUBPROJECTS = i386.subproj ppc.subproj
 
index 0fbe5d19297aab4b4111f0e3e42daec576e89ce4..e63fd0773a402d85849a65e6fb1d69c0799ec783 100644 (file)
@@ -2,7 +2,7 @@
     DYNAMIC_CODE_GEN = YES; 
     FILESTABLE = {
         H_FILES = (cthread_internals.h, cthreads.h); 
-        OTHER_LINKED = (cprocs.c, cthreads.c, lu_utils.c, mig_support.c, threads_data.c); 
+        OTHER_LINKED = (cprocs.c, cthreads.c, lu_utils.c, mig_support.c); 
         OTHER_SOURCES = (Makefile.preamble, Makefile, Makefile.postamble); 
         PROJECT_HEADERS = (cthread_internals.h, cthreads.h); 
         SUBPROJECTS = (i386.subproj, ppc.subproj); 
index ee31e521a8e4bd88bcab72089cc9cd6a8931337c..64595c3e606b73825aa9604fd4289edf0342e0de 100644 (file)
@@ -48,7 +48,7 @@ _pthread_set_self(p)
 }
 
 void *
-_pthread_self()
+pthread_self()
 {
        asm("movl       $0, %eax");
        asm("lcall      $0x3b, $0");
diff --git a/threads.subproj/threads_data.c b/threads.subproj/threads_data.c
deleted file mode 100644 (file)
index 587b938..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- * 
- * @APPLE_LICENSE_HEADER_END@
- */
-/*
- * This file contains global data and the size of the global data can NOT
- * change or otherwise it would make the shared library incompatable.  It
- * is padded so that new data can take the place of storage occupied by part
- * of it.
- */
-int msg_send_timeout = 100;    /* milliseconds */
-int msg_receive_timeout = 10;  /* milliseconds */
-int mutex_spin_limit = 0;
-int cthread_stack_mask = 0;
-extern void cthread_init();
-unsigned int cproc_default_stack_size = 1000000;
-int condition_spin_limit = 0;
-int condition_yield_limit = 7;
-unsigned int initial_stack_boundary = 0;
-unsigned int cthread_stack_base = 0;   /* Base for stack allocation */
-int    malloc_lock = 0;                        /* 
-                                        * Needs to be shared between malloc.o
-                                        * and malloc_utils.o
-                                        */
-
-/* global data padding, must NOT be static */
-char _threads_data_padding[208] = { 0 };
index 8c9fc0e014f113b1d2beee978a9eb93cdc10d32a..aa0b2ad78e6dafba2f0bf13c454866bd90985d24 100644 (file)
@@ -82,7 +82,7 @@ int openpty(amaster, aslave, name, termp, winp)
        else
                ttygid = -1;
 
-       for (cp1 = "pqrs"; *cp1; cp1++) {
+       for (cp1 = "pqrstuvwxy"; *cp1; cp1++) {
                line[8] = *cp1;
                for (cp2 = "0123456789abcdef"; *cp2; cp2++) {
                        line[5] = 'p';
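
Widening the letter set from "pqrs" to "pqrstuvwxy" grows the search space from 4 * 16 = 64 to 10 * 16 = 160 pty pairs. The names openpty() now probes, enumerated as a sketch:

    #include <stdio.h>

    int
    main(void)
    {
            const char *cp1, *cp2;
            int n = 0;

            for (cp1 = "pqrstuvwxy"; *cp1; cp1++)
                    for (cp2 = "0123456789abcdef"; *cp2; cp2++) {
                            printf("/dev/pty%c%c\n", *cp1, *cp2);
                            n++;
                    }
            printf("%d candidate pairs\n", n);  /* 160, up from 64 */
            return 0;
    }
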