PRODUCTS += $(LIBRARY_PREFIX)$(NAME)$(PROFILE_SUFFIX)$(LIBRARY_EXT)
PRODUCTS += $(LIBRARY_PREFIX)$(NAME)$(STATIC_SUFFIX)$(LIBRARY_EXT)
RECURSIVE_FLAGS += "LINK_SUBPROJECTS = NO"
+OTHER_CFLAGS += -no-cpp-precomp -force_cpusubtype_ALL
static:
$(SILENT) unset $(CUMULATIVE_VARIABLES) ||: ; \
#include <unistd.h>
#include <limits.h>
#include <pwd.h>
+#include <stdlib.h>
/*
* UNIX password, and DES, encryption.
static unsigned char a64toi[128]; /* ascii-64 => 0..63 */
/* Initial key schedule permutation */
-static C_block PC1ROT[64/CHUNKBITS][1<<CHUNKBITS];
+// static C_block PC1ROT[64/CHUNKBITS][1<<CHUNKBITS];
+static C_block *PC1ROT;
/* Subsequent key schedule rotation permutations */
-static C_block PC2ROT[2][64/CHUNKBITS][1<<CHUNKBITS];
+// static C_block PC2ROT[2][64/CHUNKBITS][1<<CHUNKBITS];
+static C_block *PC2ROT[2];
/* Initial permutation/expansion table */
-static C_block IE3264[32/CHUNKBITS][1<<CHUNKBITS];
+// static C_block IE3264[32/CHUNKBITS][1<<CHUNKBITS];
+static C_block *IE3264;
/* Table that combines the S, P, and E operations. */
-static long SPE[2][8][64];
+// static long SPE[2][8][64];
+static long *SPE;
/* compressed/interleaved => final permutation table */
-static C_block CF6464[64/CHUNKBITS][1<<CHUNKBITS];
+// static C_block CF6464[64/CHUNKBITS][1<<CHUNKBITS];
+static C_block *CF6464;
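
The changes above convert the fixed-size DES permutation tables (PC1ROT, PC2ROT, IE3264, SPE, CF6464) from statically sized multi-dimensional arrays into pointers to heap memory allocated at init time, and the later hunks (DOXOR, TO_SIX_BIT, init_perm) rewrite every access with explicit row-major index arithmetic. A minimal sketch of that mapping, with stand-in definitions for C_block and CHUNKBITS (the real ones live elsewhere in crypt.c):

#include <stdlib.h>

#define CHUNKBITS 4                                /* assumption: the non-LARGEDATA value */
typedef struct { unsigned char b[8]; } C_block;    /* stand-in for the real union */

#define ROWS (64/CHUNKBITS)
#define COLS (1<<CHUNKBITS)

/* old: static C_block PC1ROT[ROWS][COLS];  cell access: PC1ROT[i][j]       */
/* new: static C_block *PC1ROT;             cell access: PC1ROT[i*COLS + j] */
static C_block *PC1ROT;

static int init_tables(void)
{
	/* one contiguous, zero-filled block, like the former static array */
	PC1ROT = (C_block *)calloc((size_t)ROWS * COLS, sizeof(C_block));
	return PC1ROT != NULL;
}

static C_block *cell(C_block *tab, int i, int j)
{
	return &tab[(i * COLS) + j];               /* same element the 2-D array addressed */
}
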
/* ==================================== */
des_ready = 1;
}
- PERM6464(K,K0,K1,(unsigned char *)key,(C_block *)PC1ROT);
+ PERM6464(K,K0,K1,(unsigned char *)key,PC1ROT);
key = (char *)&KS[0];
STORE(K&~0x03030303L, K0&~0x03030303L, K1, *(C_block *)key);
for (i = 1; i < 16; i++) {
key += sizeof(C_block);
STORE(K,K0,K1,*(C_block *)key);
- ptabp = (C_block *)PC2ROT[Rotates[i]-1];
+ ptabp = PC2ROT[Rotates[i]-1];
PERM6464(K,K0,K1,(unsigned char *)key,ptabp);
STORE(K&~0x03030303L, K0&~0x03030303L, K1, *(C_block *)key);
}
R1 = (R1 >> 1) & 0x55555555L;
L1 = R0 | R1; /* L1 is the odd-numbered input bits */
STORE(L,L0,L1,B);
- PERM3264(L,L0,L1,B.b, (C_block *)IE3264); /* even bits */
- PERM3264(R,R0,R1,B.b+4,(C_block *)IE3264); /* odd bits */
+ PERM3264(L,L0,L1,B.b,IE3264); /* even bits */
+ PERM3264(R,R0,R1,B.b+4,IE3264); /* odd bits */
if (num_iter >= 0)
{ /* encryption */
#define SPTAB(t, i) (*(long *)((unsigned char *)t + i*(sizeof(long)/4)))
#if defined(gould)
/* use this if B.b[i] is evaluated just once ... */
-#define DOXOR(x,y,i) x^=SPTAB(SPE[0][i],B.b[i]); y^=SPTAB(SPE[1][i],B.b[i]);
+#define DOXOR(x,y,i) x^=SPTAB(&SPE[i * 64],B.b[i]); y^=SPTAB(&SPE[(8 * 64) + (i * 64)],B.b[i]);
#else
#if defined(pdp11)
/* use this if your "long" int indexing is slow */
-#define DOXOR(x,y,i) j=B.b[i]; x^=SPTAB(SPE[0][i],j); y^=SPTAB(SPE[1][i],j);
+#define DOXOR(x,y,i) j=B.b[i]; x^=SPTAB(&SPE[i * 64],j); y^=SPTAB(&SPE[(8 * 64) + (i * 64)],j);
#else
/* use this if "k" is allocated to a register ... */
-#define DOXOR(x,y,i) k=B.b[i]; x^=SPTAB(SPE[0][i],k); y^=SPTAB(SPE[1][i],k);
+#define DOXOR(x,y,i) k=B.b[i]; x^=SPTAB(&SPE[i * 64],k); y^=SPTAB(&SPE[(8 * 64) + (i * 64)],k);
#endif
#endif
L0 = ((L0 >> 3) & 0x0f0f0f0fL) | ((L1 << 1) & 0xf0f0f0f0L);
L1 = ((R0 >> 3) & 0x0f0f0f0fL) | ((R1 << 1) & 0xf0f0f0f0L);
STORE(L,L0,L1,B);
- PERM6464(L,L0,L1,B.b, (C_block *)CF6464);
+ PERM6464(L,L0,L1,B.b,CF6464);
#if defined(MUST_ALIGN)
STORE(L,L0,L1,B);
out[0] = B.b[0]; out[1] = B.b[1]; out[2] = B.b[2]; out[3] = B.b[3];
#ifdef DEBUG
prtab("pc1tab", perm, 8);
#endif
+ PC1ROT = (C_block *)calloc(sizeof(C_block), (64/CHUNKBITS) * (1<<CHUNKBITS));
+ for (i = 0; i < 2; i++)
+ PC2ROT[i] = (C_block *)calloc(sizeof(C_block), (64/CHUNKBITS) * (1<<CHUNKBITS));
init_perm(PC1ROT, perm, 8, 8);
/*
#ifdef DEBUG
prtab("ietab", perm, 8);
#endif
+ IE3264 = (C_block *)calloc(sizeof(C_block), (32/CHUNKBITS) * (1<<CHUNKBITS));
init_perm(IE3264, perm, 4, 8);
/*
#ifdef DEBUG
prtab("cftab", perm, 8);
#endif
+ CF6464 = (C_block *)calloc(sizeof(C_block), (64/CHUNKBITS) * (1<<CHUNKBITS));
+ SPE = (long *)calloc(sizeof(long), 2 * 8 * 64);
init_perm(CF6464, perm, 8, 8);
/*
k = 0;
for (i = 24; --i >= 0; )
k = (k<<1) | tmp32[perm[i]-1];
- TO_SIX_BIT(SPE[0][tableno][j], k);
+ TO_SIX_BIT(SPE[(tableno * 64) + j], k);
k = 0;
for (i = 24; --i >= 0; )
k = (k<<1) | tmp32[perm[i+24]-1];
- TO_SIX_BIT(SPE[1][tableno][j], k);
+ TO_SIX_BIT(SPE[(8 * 64) + (tableno * 64) + j], k);
}
}
}
* "perm" must be all-zeroes on entry to this routine.
*/
STATIC void init_perm(perm, p, chars_in, chars_out)
- C_block perm[64/CHUNKBITS][1<<CHUNKBITS];
+ C_block *perm;
unsigned char p[64];
int chars_in, chars_out;
{
l = 1<<(l&(CHUNKBITS-1)); /* mask for this bit */
for (j = 0; j < (1<<CHUNKBITS); j++) { /* each chunk value */
if ((j & l) != 0)
- perm[i][j].b[k>>3] |= 1<<(k&07);
+ perm[(i * (1<<CHUNKBITS)) + j].b[k>>3] |= 1<<(k&07);
}
}
}
* Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
- *
+ *
* The contents of this file constitute Original Code as defined in and
* are subject to the Apple Public Source License Version 1.1 (the
* "License"). You may not use this file except in compliance with the
* License. Please obtain a copy of the License at
* http://www.apple.com/publicsource and read it before using this file.
- *
+ *
* This Original Code and all software distributed under the License are
* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
* License for the specific language governing rights and limitations
* under the License.
- *
+ *
* @APPLE_LICENSE_HEADER_END@
*/
/*
* SUCH DAMAGE.
*/
-
#include <sys/param.h>
#include <sys/wait.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <string.h>
#include <paths.h>
+#include <crt_externs.h>
+
+#define environ *(_NSGetEnviron())
static struct pid {
struct pid *next;
struct pid *cur;
FILE *iop;
int pdes[2], pid, twoway;
+ char *argv[4];
+ struct pid *p;
if (strchr(type, '+')) {
twoway = 1;
type = "r+";
- if (socketpair(AF_UNIX, SOCK_STREAM, 0, pdes) < 0)
- return (NULL);
+ if (socketpair(AF_UNIX, SOCK_STREAM, 0, pdes) < 0)
+ return (NULL);
} else {
twoway = 0;
- if (*type != 'r' && *type != 'w' || type[1] ||
- (pipe(pdes) < 0))
+ if ((*type != 'r' && *type != 'w') || type[1])
return (NULL);
}
+	if (!twoway && pipe(pdes) < 0)
+ return (NULL);
- if ((cur = malloc(sizeof(struct pid))) == NULL)
+ if ((cur = malloc(sizeof(struct pid))) == NULL) {
+ (void)close(pdes[0]);
+ (void)close(pdes[1]);
return (NULL);
+ }
+
+ argv[0] = "sh";
+ argv[1] = "-c";
+ argv[2] = (char *)command;
+ argv[3] = NULL;
switch (pid = vfork()) {
case -1: /* Error. */
(void)close(pdes[0]);
(void)close(pdes[1]);
- (void)free(cur);
+ free(cur);
return (NULL);
/* NOTREACHED */
case 0: /* Child. */
if (*type == 'r') {
+ /*
+ * The _dup2() to STDIN_FILENO is repeated to avoid
+ * writing to pdes[1], which might corrupt the
+ * parent's copy. This isn't good enough in
+ * general, since the _exit() is no return, so
+ * the compiler is free to corrupt all the local
+ * variables.
+ */
+ (void)close(pdes[0]);
if (pdes[1] != STDOUT_FILENO) {
(void)dup2(pdes[1], STDOUT_FILENO);
(void)close(pdes[1]);
- pdes[1] = STDOUT_FILENO;
- }
- (void) close(pdes[0]);
- if (twoway && (pdes[1] != STDIN_FILENO))
+ if (twoway)
+ (void)dup2(STDOUT_FILENO, STDIN_FILENO);
+ } else if (twoway && (pdes[1] != STDIN_FILENO))
(void)dup2(pdes[1], STDIN_FILENO);
} else {
if (pdes[0] != STDIN_FILENO) {
(void)close(pdes[0]);
}
(void)close(pdes[1]);
+ }
+ for (p = pidlist; p; p = p->next) {
+ (void)close(fileno(p->fp));
}
- execl(_PATH_BSHELL, "sh", "-c", command, NULL);
+ execve(_PATH_BSHELL, argv, environ);
_exit(127);
/* NOTREACHED */
}
FILE *iop;
{
register struct pid *cur, *last;
- int omask;
int pstat;
pid_t pid;
(void)fclose(iop);
do {
- pid = waitpid(cur->pid, &pstat, 0);
+ pid = wait4(cur->pid, &pstat, 0, (struct rusage *)0);
} while (pid == -1 && errno == EINTR);
/* Remove the entry from the linked list. */
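
The popen() hunks above build an explicit argv[] and call execve() with the environment obtained through _NSGetEnviron(), have the child close the descriptors of any previously popen()ed streams, and switch pclose()'s reaping from waitpid() to wait4() inside the existing EINTR retry loop. For reference, a small self-contained usage sketch of the interface being patched (run_and_print is an illustrative name, not part of the library):

#include <stdio.h>

/* Run a shell command, echo its output, and return its wait status. */
int run_and_print(const char *cmd)
{
	char line[256];
	FILE *fp = popen(cmd, "r");        /* "r": read the child's stdout */

	if (fp == NULL)
		return -1;
	while (fgets(line, sizeof(line), fp) != NULL)
		fputs(line, stdout);
	return pclose(fp);                 /* blocks until the child exits */
}
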
HFILES = fp.h genassym.h
-OTHERLINKED = abs.s bcopy.s bzero.s ffs.s mcount.s memcpy.s\
- memmove.s strlen.s
+OTHERLINKED = abs.s blockmoof.s bzero.s ffs.s mcount.s \
+ strlen.s
CFILES = bcmp.c ecvt.c insque.c isinf.c remque.c setjmperr.c\
strcat.c strcpy.c strncat.c strncmp.c strncpy.c
OTHERSRCS = Makefile.preamble Makefile Makefile.postamble
-OTHERLINKEDOFILES = abs.o bcopy.o bzero.o ffs.o mcount.o memcpy.o\
- memmove.o strlen.o
+OTHERLINKEDOFILES = abs.o blockmoof.o bzero.o ffs.o mcount.o \
+ strlen.o
MAKEFILEDIR = $(MAKEFILEPATH)/pb_makefiles
CODE_GEN_STYLE = DYNAMIC
OTHER_LINKED = (
abs.s,
bcmp.c,
- bcopy.s,
+ blockmoof.s,
bzero.s,
ecvt.c,
ffs.s,
insque.c,
isinf.c,
mcount.s,
- memcpy.s,
- memmove.s,
remque.c,
setjmperr.c,
strcat.c,
+++ /dev/null
-/*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- *
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License"). You may not use this file except in compliance with the
- * License. Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- *
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
- */
-;
-; Copy bytes of data around. handles overlapped data.
-;
-; Change this to use Altivec later on
-;
-
-;
-; void bcopy(from, to, nbytes)
-;
-
-; Use CR5_lt to indicate non-cached
-#define noncache 20
-.text
-.align 2
-#if !defined(MEMCPY) && !defined(MEMMOVE)
-.globl _bcopy
-_bcopy:
- crclr noncache ; Set cached
- cmplw cr1,r4,r3 ; Compare "to" and "from"
- mr. r5,r5 ; Check if we have a 0 length
- mr r6,r3 ; Set source
- beqlr- cr1 ; Bail if "to" and "from" are the same
- beqlr- ; Bail if length is 0
- b Lcopyit ; Go copy it...
-
-;
-; When we move the memory, forward overlays must be handled. We
-; also can not use the cache instructions if we are from bcopy_nc.
-; We need to preserve R3 because it needs to be returned for memcpy.
-; We can be interrupted and lose control here.
-;
-; There is no stack, so in order to used floating point, we would
-; need to take the FP exception. Any potential gains by using FP
-; would be more than eaten up by this.
-;
-; Later, we should used Altivec for large moves.
-;
-
-#else
-#if defined(MEMCPY)
-.globl _memcpy
-_memcpy:
-#endif
-
-#if defined(MEMMOVE)
-.globl _memmove
-_memmove:
-#endif
- cmplw cr1,r3,r4 ; "to" and "from" the same?
- mr r6,r4 ; Set the "from"
- mr. r5,r5 ; Length zero?
- crclr noncache ; Set cached
- mr r4,r3 ; Set the "to"
- beqlr- cr1 ; "to" and "from" are the same
- beqlr- ; Length is 0
-#endif
-Lcopyit: sub r12,r4,r6 ; Get potential overlap (negative if backward move)
- lis r8,0x7FFF ; Start up a mask
- srawi r11,r12,31 ; Propagate the sign bit
- dcbt 0,r6 ; Touch in the first source line
- cntlzw r7,r5 ; Get the highest power of 2 factor of the length
- ori r8,r8,0xFFFF ; Make limit 0x7FFFFFFF
- xor r9,r12,r11 ; If sink - source was negative, invert bits
- srw r8,r8,r7 ; Get move length limitation
- sub r9,r9,r11 ; If sink - source was negative, add 1 and get absolute value
- cmplw r12,r5 ; See if we actually forward overlap
- cmplwi cr7,r9,32 ; See if at least a line between source and sink
- dcbtst 0,r4 ; Touch in the first sink line
- cmplwi cr1,r5,32 ; Are we moving more than a line?
- cror noncache,noncache,28 ; Set to not DCBZ output line if not enough space
- blt- Lfwdovrlap ; This is a forward overlapping area, handle it...
-
-;
-; R4 = sink
-; R5 = length
-; R6 = source
-;
-
-;
-; Here we figure out how much we have to move to get the sink onto a
-; cache boundary. If we can, and there are still more that 32 bytes
-; left to move, we can really speed things up by DCBZing the sink line.
-; We can not do this if noncache is set because we will take an
-; alignment exception.
-
- neg r0,r4 ; Get the number of bytes to move to align to a line boundary
- rlwinm. r0,r0,0,27,31 ; Clean it up and test it
- and r0,r0,r8 ; limit to the maximum front end move
- mtcrf 3,r0 ; Make branch mask for partial moves
- sub r5,r5,r0 ; Set the length left to move
- beq Lalline ; Already on a line...
-
- bf 31,Lalhalf ; No single byte to do...
- lbz r7,0(r6) ; Get the byte
- addi r6,r6,1 ; Point to the next
- stb r7,0(r4) ; Save the single
- addi r4,r4,1 ; Bump sink
-
-; Sink is halfword aligned here
-
-Lalhalf: bf 30,Lalword ; No halfword to do...
- lhz r7,0(r6) ; Get the halfword
- addi r6,r6,2 ; Point to the next
- sth r7,0(r4) ; Save the halfword
- addi r4,r4,2 ; Bump sink
-
-; Sink is word aligned here
-
-Lalword: bf 29,Laldouble ; No word to do...
- lwz r7,0(r6) ; Get the word
- addi r6,r6,4 ; Point to the next
- stw r7,0(r4) ; Save the word
- addi r4,r4,4 ; Bump sink
-
-; Sink is double aligned here
-
-Laldouble: bf 28,Lalquad ; No double to do...
- lwz r7,0(r6) ; Get the first word
- lwz r8,4(r6) ; Get the second word
- addi r6,r6,8 ; Point to the next
- stw r7,0(r4) ; Save the first word
- stw r8,4(r4) ; Save the second word
- addi r4,r4,8 ; Bump sink
-
-; Sink is quadword aligned here
-
-Lalquad: bf 27,Lalline ; No quad to do...
- lwz r7,0(r6) ; Get the first word
- lwz r8,4(r6) ; Get the second word
- lwz r9,8(r6) ; Get the third word
- stw r7,0(r4) ; Save the first word
- lwz r11,12(r6) ; Get the fourth word
- addi r6,r6,16 ; Point to the next
- stw r8,4(r4) ; Save the second word
- stw r9,8(r4) ; Save the third word
- stw r11,12(r4) ; Save the fourth word
- addi r4,r4,16 ; Bump sink
-
-; Sink is line aligned here
-
-Lalline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
- mtcrf 3,r5 ; Make branch mask for backend partial moves
- rlwinm r11,r5,0,0,26 ; Get number of bytes to move
- beq- Lbackend ; No full lines to move
-
- sub r5,r5,r11 ; Calculate the residual
- li r10,96 ; Stride for touch ahead
-
-Lnxtline: subic. r0,r0,1 ; Account for the line now
-
- bt- noncache,Lskipz ; Skip if we are not cached...
- dcbz 0,r4 ; Blow away the whole line because we are replacing it
- dcbt r6,r10 ; Touch ahead a bit
-
-Lskipz: lwz r7,0(r6) ; Get the first word
- lwz r8,4(r6) ; Get the second word
- lwz r9,8(r6) ; Get the third word
- stw r7,0(r4) ; Save the first word
- lwz r11,12(r6) ; Get the fourth word
- stw r8,4(r4) ; Save the second word
- lwz r7,16(r6) ; Get the fifth word
- stw r9,8(r4) ; Save the third word
- lwz r8,20(r6) ; Get the sixth word
- stw r11,12(r4) ; Save the fourth word
- lwz r9,24(r6) ; Get the seventh word
- stw r7,16(r4) ; Save the fifth word
- lwz r11,28(r6) ; Get the eighth word
- addi r6,r6,32 ; Point to the next
- stw r8,20(r4) ; Save the sixth word
- stw r9,24(r4) ; Save the seventh word
- stw r11,28(r4) ; Save the eighth word
- addi r4,r4,32 ; Bump sink
- bgt+ Lnxtline ; Do the next line, if any...
-
-
-; Move backend quadword
-
-Lbackend: bf 27,Lnoquad ; No quad to do...
- lwz r7,0(r6) ; Get the first word
- lwz r8,4(r6) ; Get the second word
- lwz r9,8(r6) ; Get the third word
- lwz r11,12(r6) ; Get the fourth word
- stw r7,0(r4) ; Save the first word
- addi r6,r6,16 ; Point to the next
- stw r8,4(r4) ; Save the second word
- stw r9,8(r4) ; Save the third word
- stw r11,12(r4) ; Save the fourth word
- addi r4,r4,16 ; Bump sink
-
-; Move backend double
-
-Lnoquad: bf 28,Lnodouble ; No double to do...
- lwz r7,0(r6) ; Get the first word
- lwz r8,4(r6) ; Get the second word
- addi r6,r6,8 ; Point to the next
- stw r7,0(r4) ; Save the first word
- stw r8,4(r4) ; Save the second word
- addi r4,r4,8 ; Bump sink
-
-; Move backend word
-
-Lnodouble: bf 29,Lnoword ; No word to do...
- lwz r7,0(r6) ; Get the word
- addi r6,r6,4 ; Point to the next
- stw r7,0(r4) ; Save the word
- addi r4,r4,4 ; Bump sink
-
-; Move backend halfword
-
-Lnoword: bf 30,Lnohalf ; No halfword to do...
- lhz r7,0(r6) ; Get the halfword
- addi r6,r6,2 ; Point to the next
- sth r7,0(r4) ; Save the halfword
- addi r4,r4,2 ; Bump sink
-
-; Move backend byte
-
-Lnohalf: bflr 31 ; Leave cuz we are all done...
- lbz r7,0(r6) ; Get the byte
- stb r7,0(r4) ; Save the single
-
- blr ; Leave cuz we are all done...
-
-;
-; 0123456789ABCDEF0123456789ABCDEF
-; 0123456789ABCDEF0123456789ABCDEF
-; F
-; DE
-; 9ABC
-; 12345678
-; 123456789ABCDEF0
-; 0
-
-;
-; Here is where we handle a forward overlapping move. These will be slow
-; because we can not kill the cache of the destination until after we have
-; loaded/saved the source area. Also, because reading memory backwards is
-; slower when the cache line needs to be loaded because the critical
-; doubleword is loaded first, i.e., the last, then it goes back to the first,
-; and on in order. That means that when we are at the second to last DW we
-; have to wait until the whole line is in cache before we can proceed.
-;
-
-Lfwdovrlap: add r4,r5,r4 ; Point past the last sink byte
- add r6,r5,r6 ; Point past the last source byte
- and r0,r4,r8 ; Apply movement limit
- li r12,-1 ; Make sure we touch in the actual line
- mtcrf 3,r0 ; Figure out the best way to move backwards
- dcbt r12,r6 ; Touch in the last line of source
- rlwinm. r0,r0,0,27,31 ; Calculate the length to adjust to cache boundary
- dcbtst r12,r4 ; Touch in the last line of the sink
- beq- Lballine ; Aready on cache line boundary
-
- sub r5,r5,r0 ; Precaculate move length left after alignment
-
- bf 31,Lbalhalf ; No single byte to do...
- lbz r7,-1(r6) ; Get the byte
- subi r6,r6,1 ; Point to the next
- stb r7,-1(r4) ; Save the single
- subi r4,r4,1 ; Bump sink
-
-; Sink is halfword aligned here
-
-Lbalhalf: bf 30,Lbalword ; No halfword to do...
- lhz r7,-2(r6) ; Get the halfword
- subi r6,r6,2 ; Point to the next
- sth r7,-2(r4) ; Save the halfword
- subi r4,r4,2 ; Bump sink
-
-; Sink is word aligned here
-
-Lbalword: bf 29,Lbaldouble ; No word to do...
- lwz r7,-4(r6) ; Get the word
- subi r6,r6,4 ; Point to the next
- stw r7,-4(r4) ; Save the word
- subi r4,r4,4 ; Bump sink
-
-; Sink is double aligned here
-
-Lbaldouble: bf 28,Lbalquad ; No double to do...
- lwz r7,-8(r6) ; Get the first word
- lwz r8,-4(r6) ; Get the second word
- subi r6,r6,8 ; Point to the next
- stw r7,-8(r4) ; Save the first word
- stw r8,-4(r4) ; Save the second word
- subi r4,r4,8 ; Bump sink
-
-; Sink is quadword aligned here
-
-Lbalquad: bf 27,Lballine ; No quad to do...
- lwz r7,-16(r6) ; Get the first word
- lwz r8,-12(r6) ; Get the second word
- lwz r9,-8(r6) ; Get the third word
- lwz r11,-4(r6) ; Get the fourth word
- stw r7,-16(r4) ; Save the first word
- subi r6,r6,16 ; Point to the next
- stw r8,-12(r4) ; Save the second word
- stw r9,-8(r4) ; Save the third word
- stw r11,-4(r4) ; Save the fourth word
- subi r4,r4,16 ; Bump sink
-
-; Sink is line aligned here
-
-Lballine: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
- mtcrf 3,r5 ; Make branch mask for backend partial moves
- beq- Lbbackend ; No full lines to move
-
-
-; Registers in use: R0, R1, R3, R4, R5, R6
-; Registers not in use: R2, R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them
-
-Lbnxtline: subic. r0,r0,1 ; Account for the line now
-
- lwz r7,-32(r6) ; Get the first word
- lwz r5,-28(r6) ; Get the second word
- lwz r2,-24(r6) ; Get the third word
- lwz r12,-20(r6) ; Get the third word
- lwz r11,-16(r6) ; Get the fifth word
- lwz r10,-12(r6) ; Get the sixth word
- lwz r9,-8(r6) ; Get the seventh word
- lwz r8,-4(r6) ; Get the eighth word
- subi r6,r6,32 ; Point to the next
-
- stw r7,-32(r4) ; Get the first word
- ble- Lbnotouch ; Last time, skip touch of source...
- dcbt 0,r6 ; Touch in next source line
-
-Lbnotouch: stw r5,-28(r4) ; Get the second word
- stw r2,-24(r4) ; Get the third word
- stw r12,-20(r4) ; Get the third word
- stw r11,-16(r4) ; Get the fifth word
- stw r10,-12(r4) ; Get the sixth word
- stw r9,-8(r4) ; Get the seventh word
- stw r8,-4(r4) ; Get the eighth word
- subi r4,r4,32 ; Bump sink
-
- bgt+ Lbnxtline ; Do the next line, if any...
-
-;
-; Note: We touched these lines in at the beginning
-;
-
-; Move backend quadword
-
-Lbbackend: bf 27,Lbnoquad ; No quad to do...
- lwz r7,-16(r6) ; Get the first word
- lwz r8,-12(r6) ; Get the second word
- lwz r9,-8(r6) ; Get the third word
- lwz r11,-4(r6) ; Get the fourth word
- stw r7,-16(r4) ; Save the first word
- subi r6,r6,16 ; Point to the next
- stw r8,-12(r4) ; Save the second word
- stw r9,-8(r4) ; Save the third word
- stw r11,-4(r4) ; Save the fourth word
- subi r4,r4,16 ; Bump sink
-
-; Move backend double
-
-Lbnoquad: bf 28,Lbnodouble ; No double to do...
- lwz r7,-8(r6) ; Get the first word
- lwz r8,-4(r6) ; Get the second word
- subi r6,r6,8 ; Point to the next
- stw r7,-8(r4) ; Save the first word
- stw r8,-4(r4) ; Save the second word
- subi r4,r4,8 ; Bump sink
-
-; Move backend word
-
-Lbnodouble: bf 29,Lbnoword ; No word to do...
- lwz r7,-4(r6) ; Get the word
- subi r6,r6,4 ; Point to the next
- stw r7,-4(r4) ; Save the word
- subi r4,r4,4 ; Bump sink
-
-; Move backend halfword
-
-Lbnoword: bf 30,Lbnohalf ; No halfword to do...
- lhz r7,-2(r6) ; Get the halfword
- subi r6,r6,2 ; Point to the next
- sth r7,-2(r4) ; Save the halfword
- subi r4,r4,2 ; Bump sink
-
-; Move backend byte
-
-Lbnohalf: bflr 31 ; Leave cuz we are all done...
- lbz r7,-1(r6) ; Get the byte
- stb r7,-1(r4) ; Save the single
-
- blr ; Leave cuz we are all done...
--- /dev/null
+/*
+ * Copyright (c) 1992-2001 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+#include <architecture/ppc/asm_help.h>
+
+// =================================================================================================
+// *** The easiest way to assemble things on Mac OS X is via "cc", so this uses #defines and such.
+// =================================================================================================
+
+// Keep track of whether we have Altivec
+// This gets set in pthread_init()
+
+.data
+.align 2
+.globl __cpu_has_altivec
+__cpu_has_altivec:
+.long 0
+
+.text
+.align 2
+.globl _bcopy
+.globl _memcpy
+.globl _memmove
+
+_bcopy:
+ mr r2,r4 // Since bcopy uses (src,dest,count), swap r3,r4
+ mr r4,r3
+ mr r3,r2
+_memcpy:
+_memmove:
+ mr r2,r3 // Store dest ptr in r2 to preserve r3 on return
+
+// ------------------
+// Standard registers
+
+#define rs r4
+#define rd r2
+#define rc r5
+
+// Should we bother using Altivec?
+
+ cmpwi r5, 128
+ blt+ LScalar
+
+// Determine whether we have Altivec enabled
+
+ mflr r0
+ bcl 20,31,1f
+1:
+ mflr r6
+ mtlr r0
+ addis r6, r6, ha16(__cpu_has_altivec - 1b)
+ lwz r6, lo16(__cpu_has_altivec - 1b)(r6)
+ cmpwi r6, 0
+ bne+ LAltivec
+
+// =================================================================================================
+
+// *****************************************
+// * S c a l a r B l o c k M o o f D a t a *
+// *****************************************
+//
+// This is the scalar (non-AltiVec) version of BlockMoofData.
+//
+// void ScalarBlockMoofData (ptr sou, ptr dest, long len)
+// void ScalarBlockMoofDataUncached (ptr sou, ptr dest, long len)
+//
+//
+// Calling Sequence: r3 = source pointer
+// r4 = destination pointer
+// r5 = length in bytes
+//
+// Uses: all volatile registers.
+
+LScalar:
+ cmplwi cr7,rc,32 // length <= 32 bytes?
+ cmplw cr6,rd,rs // up or down?
+ mr. r0,rc // copy to r0 for MoveShort, and test for negative
+ bgt cr7,Lbm1 // skip if count > 32
+
+// Handle short moves (<=32 bytes.)
+
+ beq cr7,LMove32 // special case 32-byte blocks
+ blt cr6,LMoveDownShort // move down in memory and return
+ add rs,rs,rc // moving up (right-to-left), so adjust pointers
+ add rd,rd,rc
+ b LMoveUpShort // move up in memory and return
+
+// Handle long moves (>32 bytes.)
+
+Lbm1:
+ beqlr cr6 // rs==rd, so nothing to move
+ bltlr cr0 // length<0, so ignore call and return
+ mflr r12 // save return address
+ bge cr6,Lbm2 // rd>=rs, so move up
+
+// Long moves down (left-to-right.)
+
+ neg r6,rd // start to 32-byte-align destination
+ andi. r0,r6,0x1F // r0 <- bytes to move to align destination
+ bnel LMoveDownShort // align destination if necessary
+ bl LMoveDownLong // move 32-byte chunks down
+ andi. r0,rc,0x1F // done?
+ mtlr r12 // restore caller's return address
+ bne LMoveDownShort // move trailing leftover bytes and done
+ blr // no leftovers, so done
+
+// Long moves up (right-to-left.)
+
+Lbm2:
+ add rs,rs,rc // moving up (right-to-left), so adjust pointers
+ add rd,rd,rc
+ andi. r0,rd,0x1F // r0 <- bytes to move to align destination
+ bnel LMoveUpShort // align destination if necessary
+ bl LMoveUpLong // move 32-byte chunks up
+ andi. r0,rc,0x1F // done?
+ mtlr r12 // restore caller's return address
+ bne LMoveUpShort // move trailing leftover bytes and done
+ blr // no leftovers, so done
+
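
The scalar path above follows the classic memmove shape: decide the copy direction from the relation of destination and source so overlapping bytes are never clobbered before they are read, align the destination, move 32-byte chunks, then finish the sub-32-byte tail. A byte-granular C sketch of just that direction logic (without the alignment, chunking, or cache hints the assembly adds):

#include <stddef.h>

static void *simple_memmove(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (d == s || n == 0)
		return dst;
	if (d < s) {                       /* move down: copy left to right */
		while (n--)
			*d++ = *s++;
	} else {                           /* move up: copy right to left */
		d += n;
		s += n;
		while (n--)
			*--d = *--s;
	}
	return dst;
}
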
+// ***************
+// * M O V E 3 2 *
+// ***************
+//
+// Special case subroutine to move a 32-byte block. MoveDownShort and
+// MoveUpShort only handle 0..31 bytes, and we believe 32 bytes is too
+// common a case to send it through the general purpose long-block code.
+// Since it moves both up and down, we must load all 32 bytes before
+// storing any.
+//
+// Calling Sequence: rs = source ptr
+// rd = destination ptr
+//
+// Uses: r0,r5-r11.
+//
+
+LMove32:
+ lwz r0,0(rs)
+ lwz r5,4(rs)
+ lwz r6,8(rs)
+ lwz r7,12(rs)
+ lwz r8,16(rs)
+ lwz r9,20(rs)
+ lwz r10,24(rs)
+ lwz r11,28(rs)
+ stw r0,0(rd)
+ stw r5,4(rd)
+ stw r6,8(rd)
+ stw r7,12(rd)
+ stw r8,16(rd)
+ stw r9,20(rd)
+ stw r10,24(rd)
+ stw r11,28(rd)
+ blr
+
+
+// *************************
+// * M o v e U p S h o r t *
+// *************************
+//
+// Subroutine called to move <32 bytes up in memory (ie, right-to-left).
+//
+// Entry conditions: rs = last byte moved from source (right-to-left)
+// rd = last byte moved into destination
+// r0 = #bytes to move (0..31)
+//
+// Exit conditions: rs = updated source ptr
+// rd = updated destination ptr
+// rc = decremented by #bytes moved
+//
+// Uses: r0,r6,r7,r8,cr7.
+//
+
+LMoveUpShort:
+ andi. r6,r0,0x10 // test 0x10 bit in length
+ mtcrf 0x1,r0 // move count to cr7 so we can test bits
+ sub rc,rc,r0 // decrement count of bytes remaining to be moved
+ beq Lmus1 // skip if 0x10 bit in length is 0
+ lwzu r0,-16(rs) // set, so copy up 16 bytes
+ lwz r6,4(rs)
+ lwz r7,8(rs)
+ lwz r8,12(rs)
+ stwu r0,-16(rd)
+ stw r6,4(rd)
+ stw r7,8(rd)
+ stw r8,12(rd)
+
+Lmus1:
+ bf 28,Lmus2 // test 0x08 bit
+ lwzu r0,-8(rs)
+ lwz r6,4(rs)
+ stwu r0,-8(rd)
+ stw r6,4(rd)
+
+Lmus2:
+ bf 29,Lmus3 // test 0x4 bit
+ lwzu r0,-4(rs)
+ stwu r0,-4(rd)
+
+Lmus3:
+ bf 30,Lmus4 // test 0x2 bit
+ lhzu r0,-2(rs)
+ sthu r0,-2(rd)
+
+Lmus4:
+ bflr 31 // test 0x1 bit, return if 0
+ lbzu r0,-1(rs)
+ stbu r0,-1(rd)
+ blr
+
+
+// *****************************
+// * M o v e D o w n S h o r t *
+// *****************************
+//
+// Subroutine called to move <32 bytes down in memory (ie, left-to-right).
+//
+// Entry conditions: rs = source pointer
+// rd = destination pointer
+// r0 = #bytes to move (0..31)
+//
+// Exit conditions: rs = ptr to 1st byte not moved
+// rd = ptr to 1st byte not moved
+// rc = decremented by #bytes moved
+//
+// Uses: r0,r6,r7,r8,cr7.
+//
+
+LMoveDownShort:
+ andi. r6,r0,0x10 // test 0x10 bit in length
+ mtcrf 0x1,r0 // move count to cr7 so we can test bits
+ sub rc,rc,r0 // decrement count of bytes remaining to be moved
+ beq Lmds1 // skip if 0x10 bit in length is 0
+ lwz r0,0(rs) // set, so copy up 16 bytes
+ lwz r6,4(rs)
+ lwz r7,8(rs)
+ lwz r8,12(rs)
+ addi rs,rs,16
+ stw r0,0(rd)
+ stw r6,4(rd)
+ stw r7,8(rd)
+ stw r8,12(rd)
+ addi rd,rd,16
+
+Lmds1:
+ bf 28,Lmds2 // test 0x08 bit
+ lwz r0,0(rs)
+ lwz r6,4(rs)
+ addi rs,rs,8
+ stw r0,0(rd)
+ stw r6,4(rd)
+ addi rd,rd,8
+
+Lmds2:
+ bf 29,Lmds3 // test 0x4 bit
+ lwz r0,0(rs)
+ addi rs,rs,4
+ stw r0,0(rd)
+ addi rd,rd,4
+
+Lmds3:
+ bf 30,Lmds4 // test 0x2 bit
+ lhz r0,0(rs)
+ addi rs,rs,2
+ sth r0,0(rd)
+ addi rd,rd,2
+
+Lmds4:
+ bflr 31 // test 0x1 bit, return if 0
+ lbz r0,0(rs)
+ addi rs,rs,1
+ stb r0,0(rd)
+ addi rd,rd,1
+ blr
+
+
+// ***********************
+// * M o v e U p L o n g *
+// ***********************
+//
+// Subroutine to move 32-byte chunks of memory up (ie, right-to-left.)
+// The destination is known to be 32-byte aligned, but the source is
+// *not* necessarily aligned.
+//
+// Entry conditions: rs = last byte moved from source (right-to-left)
+// rd = last byte moved into destination
+// rc = count of bytes to move
+// cr = crCached set iff destination is cacheable
+//
+// Exit conditions: rs = updated source ptr
+// rd = updated destination ptr
+// rc = low order 8 bits of count of bytes to move
+//
+// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.
+//
+
+LMoveUpLong:
+ srwi. r11,rc,5 // r11 <- #32 byte chunks to move
+ mtctr r11 // prepare loop count
+ beqlr // return if no chunks to move
+ andi. r0,rs,7 // is source at least doubleword aligned?
+ beq Lmup3 // yes, can optimize this case
+ mtcrf 0x1,rc // save low bits of count
+ mtcrf 0x2,rc // (one cr at a time, as 604 prefers)
+
+Lmup1: // loop over each 32-byte-chunk
+ lwzu r0,-32(rs)
+ subi rd,rd,32 // prepare destination address for 'dcbz'
+ lwz r5,4(rs)
+ lwz r6,8(rs)
+ lwz r7,12(rs)
+ lwz r8,16(rs)
+ lwz r9,20(rs)
+ lwz r10,24(rs)
+ lwz r11,28(rs)
+ stw r0,0(rd)
+ stw r5,4(rd)
+ stw r6,8(rd)
+ stw r7,12(rd)
+ stw r8,16(rd)
+ stw r9,20(rd)
+ stw r10,24(rd)
+ stw r11,28(rd)
+ bdnz Lmup1
+ mfcr rc // restore low bits of count
+ blr // return to caller
+
+// Aligned operands, so use d.p. floating point registers to move data.
+
+Lmup3:
+ lfdu f0,-32(rs)
+ subi rd,rd,32 // prepare destination address for 'dcbz'
+ lfd f1,8(rs)
+ lfd f2,16(rs)
+ lfd f3,24(rs)
+ stfd f0,0(rd)
+ stfd f1,8(rd)
+ stfd f2,16(rd)
+ stfd f3,24(rd)
+ bdnz Lmup3
+ blr // return to caller
+
+
+// ***************************
+// * M o v e D o w n L o n g *
+// ***************************
+//
+// Subroutine to move 32-byte chunks of memory down (ie, left-to-right.)
+// The destination is known to be 32-byte aligned, but the source is
+// *not* necessarily aligned.
+//
+// Entry conditions: rs = source ptr (next byte to move)
+// rd = dest ptr (next byte to move into)
+// rc = count of bytes to move
+// cr = crCached set iff destination is cacheable
+//
+// Exit conditions: rs = updated source ptr
+// rd = updated destination ptr
+// rc = low order 8 bits of count of bytes to move
+//
+// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.
+//
+
+LMoveDownLong:
+ srwi. r11,rc,5 // r11 <- #32 byte chunks to move
+ mtctr r11 // prepare loop count
+ beqlr // return if no chunks to move
+ andi. r0,rs,7 // is source at least doubleword aligned?
+ beq Lmdown3 // yes, can optimize this case
+ mtcrf 0x1,rc // save low 8 bits of count
+ mtcrf 0x2,rc // (one cr at a time, as 604 prefers)
+
+Lmdown1: // loop over each 32-byte-chunk
+ lwz r0,0(rs)
+ lwz r5,4(rs)
+ lwz r6,8(rs)
+ lwz r7,12(rs)
+ lwz r8,16(rs)
+ lwz r9,20(rs)
+ lwz r10,24(rs)
+ lwz r11,28(rs)
+ stw r0,0(rd)
+ stw r5,4(rd)
+ stw r6,8(rd)
+ stw r7,12(rd)
+ stw r8,16(rd)
+ stw r9,20(rd)
+ addi rs,rs,32
+ stw r10,24(rd)
+ stw r11,28(rd)
+ addi rd,rd,32
+ bdnz Lmdown1
+ mfcr rc // restore low bits of count
+ blr // return to caller
+
+// Aligned operands, so use d.p. floating point registers to move data.
+
+Lmdown3:
+ lfd f0,0(rs)
+ lfd f1,8(rs)
+ lfd f2,16(rs)
+ lfd f3,24(rs)
+ addi rs,rs,32
+ stfd f0,0(rd)
+ stfd f1,8(rd)
+ stfd f2,16(rd)
+ stfd f3,24(rd)
+ addi rd,rd,32
+ bdnz Lmdown3
+ blr // return to caller
+
+//
+// Register use conventions are as follows:
+//
+// r0 - temp
+// r6 - copy of VMX SPR at entry
+// r7 - temp
+// r8 - constant -1 (also temp and a string op buffer)
+// r9 - constant 16 or -17 (also temp and a string op buffer)
+// r10- constant 32 or -33 (also temp and a string op buffer)
+// r11- constant 48 or -49 (also temp and a string op buffer)
+// r12- chunk count ("c") in long moves
+//
+// v0 - vp - permute vector
+// v1 - va - 1st quadword of source
+// v2 - vb - 2nd quadword of source
+// v3 - vc - 3rd quadword of source
+// v4 - vd - 4th quadword of source
+// v5 - vx - temp
+// v6 - vy - temp
+// v7 - vz - temp
+
+#define vp v0
+#define va v1
+#define vb v2
+#define vc v3
+#define vd v4
+#define vx v5
+#define vy v6
+#define vz v7
+
+#define VRSave 256
+
+// kShort should be the crossover point where the long algorithm is faster than the short.
+// WARNING: kShort must be >= 64
+
+// Yes, I know, we just checked rc > 128 to get here...
+
+#define kShort 128
+LAltivec:
+ cmpwi cr1,rc,kShort //(1) too short to bother using vector regs?
+ sub. r0,rd,rs //(1) must move reverse if (rd-rs)<rc
+ dcbt 0,rs //(2) prefetch first source block
+ cmplw cr6,r0,rc //(2) set cr6 blt iff we must move reverse
+ beqlr- //(2) done if src==dest
+ srawi. r9,rc,4 //(3) r9 <- quadwords to move, test for zero
+ or r8,rs,rd //(3) start to check for word alignment
+ dcbtst 0,rd //(4) prefetch first destination block
+ rlwinm r8,r8,0,30,31 //(4) r8 is zero if word aligned
+ bgt- cr1,LMoveLong //(4) handle long operands
+ cmpwi cr1,r8,0 //(5) word aligned?
+ rlwinm r7,rc,0,28,31 //(5) r7 <- leftover bytes to move after quadwords
+ bltlr- //(5) done if negative count
+ blt- cr6,LShortReverse //(5) handle reverse moves
+ cmpwi cr7,r7,0 //(6) leftover bytes?
+ beq- Leftovers //(6) r9==0, so no quadwords to move
+ mtctr r9 //(7) set up for quadword loop
+ bne- cr1,LUnalignedLoop //(7) not word aligned (less common than word aligned)
+
+
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><> S H O R T O P E R A N D S <><>
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+LAlignedLoop: // word aligned operands (the common case)
+ lfd f0,0(rs) //(1)
+ lfd f1,8(rs) //(2)
+ addi rs,rs,16 //(2)
+ stfd f0,0(rd) //(3)
+ stfd f1,8(rd) //(4)
+ addi rd,rd,16 //(4)
+ bdnz LAlignedLoop //(4)
+
+Leftovers:
+ beqlr- cr7 //(8) done if r7==0, ie no leftover bytes
+ mtxer r7 //(9) count of bytes to move (1-15)
+ lswx r8,0,rs
+ stswx r8,0,rd
+ blr //(17)
+
+LUnalignedLoop: // not word aligned, cannot use lfd/stfd
+ lwz r8,0(rs) //(1)
+ lwz r9,4(rs) //(2)
+ lwz r10,8(rs) //(3)
+ lwz r11,12(rs) //(4)
+ addi rs,rs,16 //(4)
+ stw r8,0(rd) //(5)
+ stw r9,4(rd) //(6)
+ stw r10,8(rd) //(7)
+ stw r11,12(rd) //(8)
+ addi rd,rd,16 //(8)
+ bdnz LUnalignedLoop //(8)
+
+ b Leftovers
+
+
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><> S H O R T R E V E R S E M O V E S <><>
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+ // cr0 & r9 <- #doublewords to move (>=0)
+ // cr1 <- beq if word aligned
+ // r7 <- #leftover bytes to move (0-15)
+
+LShortReverse:
+ cmpwi cr7,r7,0 // leftover bytes?
+ add rs,rs,rc // point 1 past end of string for reverse moves
+ add rd,rd,rc
+ beq- LeftoversReverse // r9==0, ie no words to move
+ mtctr r9 // set up for quadword loop
+ bne- cr1,LUnalignedLoopReverse
+
+LAlignedLoopReverse: // word aligned, so use lfd/stfd
+ lfd f0,-8(rs)
+ lfdu f1,-16(rs)
+ stfd f0,-8(rd)
+ stfdu f1,-16(rd)
+ bdnz LAlignedLoopReverse
+
+LeftoversReverse:
+ beqlr- cr7 // done if r7==0, ie no leftover bytes
+ mtxer r7 // count of bytes to move (1-15)
+ neg r7,r7 // index back by #bytes
+ lswx r8,r7,rs
+ stswx r8,r7,rd
+ blr
+
+LUnalignedLoopReverse: // not word aligned, cannot use lfd/stfd
+ lwz r8,-4(rs)
+ lwz r9,-8(rs)
+ lwz r10,-12(rs)
+ lwzu r11,-16(rs)
+ stw r8,-4(rd)
+ stw r9,-8(rd)
+ stw r10,-12(rd)
+ stwu r11,-16(rd)
+ bdnz LUnalignedLoopReverse
+
+ b LeftoversReverse
+
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><> L O N G O P E R A N D S <><>
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+ // cr6 set (blt) if must move reverse
+ // r0 <- (rd - rs)
+
+LMoveLong:
+ mfspr r6,VRSave //(5) save caller's VMX mask register
+ stw r6,-4(r1) // use CR save area so we can use r6 later
+ neg r8,rd //(5) start to compute #bytes to fill in 1st dest quadword
+ rlwinm r0,r0,0,28,31 //(6) start to determine relative alignment
+ andi. r7,r8,0xF //(6) r7 <- #bytes to fill in 1st dest quadword
+ cmpwi cr7,r0,0 //(7) relatively aligned? (ie, 16 bytes apart?)
+ oris r9,r6,0xFF00 //(7) light bits for regs we use (v0-v7)
+ mtspr VRSave,r9 //(8) update live register bitmask
+ blt- cr6,LongReverse //(8) must move reverse direction
+ sub rc,rc,r7 //(9) adjust length while we wait
+ beq- LDest16Aligned //(9) r7==0, ie destination already quadword aligned
+
+ // Align destination on a quadword.
+
+ mtxer r7 //(10) set up byte count (1-15)
+ lswx r8,0,rs // load into r8-r11
+ stswx r8,0,rd // store r8-r11 (measured latency on arthur is 7.2 cycles)
+ add rd,rd,r7 //(18) adjust ptrs
+ add rs,rs,r7 //(18)
+
+ // Begin preparation for inner loop and "dst" stream.
+
+LDest16Aligned:
+ andi. r0,rd,0x10 //(19) is destination cache-block aligned?
+ li r9,16 //(19) r9 <- constant used to access 2nd quadword
+ li r10,32 //(20) r10<- constant used to access 3rd quadword
+ beq- cr7,LAligned //(20) handle relatively aligned operands
+ lvx va,0,rs //(20) prefetch 1st source quadword
+ li r11,48 //(21) r11<- constant used to access 4th quadword
+ lvsl vp,0,rs //(21) get permute vector to left shift
+ beq LDest32Aligned //(22) destination already cache-block aligned
+
+ // Copy 16 bytes to align destination on 32-byte (cache block) boundary
+ // to maximize store gathering.
+
+ lvx vb,r9,rs //(23) get 2nd source qw
+ subi rc,rc,16 //(23) adjust count
+ addi rs,rs,16 //(24) adjust source ptr
+ vperm vx,va,vb,vp //(25) vx <- 1st destination qw
+ vor va,vb,vb //(25) va <- vb
+ stvx vx,0,rd //(26) assuming store Q deep enough to avoid latency
+ addi rd,rd,16 //(26) adjust dest ptr
+
+ // Destination 32-byte aligned, source alignment unknown.
+
+LDest32Aligned:
+ srwi. r12,rc,6 //(27) r12<- count of 64-byte chunks to move
+ rlwinm r7,rc,28,30,31 //(27) r7 <- count of 16-byte chunks to move
+ cmpwi cr1,r7,0 //(28) remember if any 16-byte chunks
+ rlwinm r8,r12,0,26,31 //(29) mask chunk count down to 0-63
+ subi r0,r8,1 //(30) r8==0?
+ beq- LNoChunks //(30) r12==0, ie no chunks to move
+ rlwimi r8,r0,0,25,25 //(31) if r8==0, then r8 <- 64
+ li r0,64 //(31) r0 <- used to get 1st quadword of next chunk
+ sub. r12,r12,r8 //(32) adjust chunk count, set cr0
+ mtctr r8 //(32) set up loop count
+ li r8,96 //SKP
+ li r6,128 //SKP
+ // Inner loop for unaligned sources. We copy 64 bytes per iteration.
+ // We loop at most 64 times, then reprime the "dst" and loop again for
+ // the next 4KB. This loop is tuned to keep the CPU flat out, which
+ // means we need to execute a lvx or stvx every cycle.
+
+LoopBy64:
+ dcbt rs,r8 //SKP
+ dcbt rs,r6 //SKP
+ lvx vb,r9,rs //(1) 2nd source quadword (1st already in va)
+ lvx vc,r10,rs //(2) 3rd
+ lvx vd,r11,rs //(3) 4th
+ vperm vx,va,vb,vp //(3) vx <- 1st destination quadword
+ lvx va,rs,r0 //(4) get 1st qw of next 64-byte chunk (r0 must be RB!)
+ vperm vy,vb,vc,vp //(4) vy <- 2nd dest qw
+ stvx vx,0,rd //(5)
+ vperm vz,vc,vd,vp //(5) vz <- 3rd dest qw
+ stvx vy,r9,rd //(6)
+ vperm vx,vd,va,vp //(6) vx <- 4th
+ stvx vz,r10,rd //(7)
+ addi rs,rs,64 //(7)
+ stvx vx,r11,rd //(8)
+ addi rd,rd,64 //(8)
+ bdnz LoopBy64 //(8)
+
+ // End of inner loop. Should we reprime dst stream and restart loop?
+ // This block is only executed when we're moving more than 4KB.
+ // It is usually folded out because cr0 is set in the loop prologue.
+
+ beq+ LNoChunks // r12==0, ie no more chunks to move
+ sub. r12,r12,r0 // set cr0 if more than 4KB remain to xfer
+ mtctr r0 // initialize loop count to 64
+ b LoopBy64 // restart inner loop, xfer another 4KB
+
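
The unaligned inner loop above never issues a misaligned load: lvsl computes a permute vector from the source misalignment once, and each aligned 16-byte store is then assembled by vperm from the tail of one aligned source quadword and the head of the next. A scalar C model of what one such vperm produces (an illustrative helper, not the vector code itself):

#include <stdint.h>

/* Build one aligned 16-byte destination block from two adjacent aligned
 * source blocks, where `offset` (0..15) is the source misalignment.
 * This is the byte selection that lvsl + vperm perform per quadword. */
static void permute_16(uint8_t dst[16], const uint8_t blk_a[16],
                       const uint8_t blk_b[16], unsigned offset)
{
	unsigned i;

	for (i = 0; i < 16; i++)
		dst[i] = (offset + i < 16) ? blk_a[offset + i]
		                           : blk_b[offset + i - 16];
}
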
+ // Fewer than 64 bytes remain to be moved.
+
+LNoChunks: // r7 and cr1 are set with the number of QWs
+ andi. rc,rc,0xF //(33) rc <- leftover bytes
+ beq- cr1,LCleanup //(33) r7==0, ie fewer than 16 bytes remaining
+ mtctr r7 //(34) we will loop over 1-3 QWs
+
+LoopBy16:
+ lvx vb,r9,rs //(1) vb <- 2nd source quadword
+ addi rs,rs,16 //(1)
+ vperm vx,va,vb,vp //(3) vx <- next destination quadword
+ vor va,vb,vb //(3) va <- vb
+ stvx vx,0,rd //(4) assuming store Q is deep enough to mask latency
+ addi rd,rd,16 //(4)
+ bdnz LoopBy16 //(4)
+
+ // Move remaining bytes in last quadword. rc and cr0 have the count.
+
+LCleanup:
+ lwz r6,-4(r1) // load VRSave from CR save area
+ mtspr VRSave,r6 //(35) restore caller's live-register bitmask
+ beqlr //(36) rc==0, ie no leftovers, so done
+ mtxer rc //(37) load byte count (1-15)
+ lswx r8,0,rs
+ stswx r8,0,rd
+ blr //(45)
+
+
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><> L O N G A L I G N E D M O V E S <><>
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+ // rs, rd <- both quadword aligned
+ // cr0 <- beq if dest is cache block (32-byte) aligned
+ // r9 <- 16
+ // r10 <- 32
+
+LAligned:
+ lvx va,0,rs // prefetch 1st source quadword
+ li r11,48 // r11<- constant used to access 4th quadword
+ beq LAligned32 // destination already cache-block aligned
+
+ // Copy 16 bytes to align destination on 32-byte (cache block) boundary
+ // to maximize store gathering.
+
+ subi rc,rc,16 // adjust count
+ addi rs,rs,16 // adjust source ptr
+ stvx va,0,rd // assuming store Q deep enough to avoid latency
+ addi rd,rd,16 // adjust dest ptr
+
+ // Destination 32-byte aligned, source 16-byte aligned. Set up for inner loop.
+
+LAligned32:
+ srwi. r12,rc,6 // r12<- count of 64-byte chunks to move
+ rlwinm r7,rc,28,30,31 // r7 <- count of 16-byte chunks to move
+ cmpwi cr1,r7,0 // remember if any 16-byte chunks
+ rlwinm r8,r12,0,26,31 // mask chunk count down to 0-63
+ subi r0,r8,1 // r8==0?
+ beq- LAlignedNoChunks // r12==0, ie no chunks to move
+ rlwimi r8,r0,0,25,25 // if r8==0, then r8 <- 64
+ li r0,64 // r0 <- used at end of loop
+ sub. r12,r12,r8 // adjust chunk count, set cr0
+ mtctr r8 // set up loop count
+ li r8,96 //SKP
+ li r6,128 //SKP
+
+ // Inner loop for aligned sources. We copy 64 bytes per iteration.
+
+LAlignedLoopBy64:
+ dcbt rs,r8 //SKP
+ dcbt rs,r6 //SKP
+ lvx va,0,rs //(1)
+ lvx vb,r9,rs //(2)
+ lvx vc,r10,rs //(3)
+ lvx vd,r11,rs //(4)
+ addi rs,rs,64 //(4)
+ stvx va,0,rd //(5)
+ stvx vb,r9,rd //(6)
+ stvx vc,r10,rd //(7)
+ stvx vd,r11,rd //(8)
+ addi rd,rd,64 //(8)
+ bdnz LAlignedLoopBy64 //(8)
+
+ // End of inner loop. Loop again for next 4KB iff any.
+
+ beq+ LAlignedNoChunks // r12==0, ie no more chunks to move
+ sub. r12,r12,r0 // set cr0 if more than 4KB remain to xfer
+ mtctr r0 // reinitialize loop count to 64
+ b LAlignedLoopBy64 // restart inner loop, xfer another 4KB
+
+ // Fewer than 64 bytes remain to be moved.
+
+LAlignedNoChunks: // r7 and cr1 are set with the number of QWs
+ andi. rc,rc,0xF // rc <- leftover bytes
+ beq- cr1,LCleanup // r7==0, ie fewer than 16 bytes remaining
+ mtctr r7 // we will loop over 1-3 QWs
+
+LAlignedLoopBy16:
+ lvx va,0,rs // get next quadword
+ addi rs,rs,16
+ stvx va,0,rd
+ addi rd,rd,16
+ bdnz LAlignedLoopBy16
+
+ b LCleanup // handle last 0-15 bytes, if any
+
+
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><> L O N G R E V E R S E M O V E S <><>
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+ // Reverse moves. These involve overlapping operands, with the source
+ // lower in memory (lower addresses) than the destination. They must be
+ // done right-to-left, ie from high addresses down to low addresses.
+ // Throughout this code, we maintain rs and rd as pointers one byte past
+ // the end of the untransferred operands.
+ //
+ // The byte count is >=kShort and the following registers are already loaded:
+ //
+ // r6 - VMX mask at entry
+ // cr7 - beq if relatively aligned
+ //
+
+LongReverse:
+ add rd,rd,rc // update source/dest ptrs to be 1 byte past end
+ add rs,rs,rc
+ andi. r7,rd,0xF // r7 <- #bytes needed to move to align destination
+ sub rc,rc,r7 // adjust length while we wait
+ sub rs,rs,r7 // adjust ptrs by #bytes to xfer, also while we wait
+ sub rd,rd,r7
+ beq- LDest16AlignedReverse
+
+ // Align destination on a quadword. Note that we do NOT align on a cache
+ // block boundary for store gathering etc// since all these operands overlap
+ // many dest cache blocks will already be in the L1, so its not clear that
+ // this would be a win.
+
+ mtxer r7 // load byte count
+ lswx r8,0,rs
+ stswx r8,0,rd
+
+ // Prepare for inner loop and start "dstst" stream. Frankly, its not
+ // clear whether "dst" or "dstst" would be better// somebody should
+ // measure. We use "dstst" because, being overlapped, at least some
+ // source cache blocks will also be stored into.
+
+LDest16AlignedReverse:
+ srwi. r12,rc,6 // r12 <- count of 64-byte chunks to move
+ rlwinm r0,rc,11,9,15 // position quadword count for dst
+ rlwinm r11,r12,0,26,31 // mask chunk count down to 0-63
+ li r9,-17 // r9 <- constant used to access 2nd quadword
+ oris r0,r0,0x0100 // set dst block size to 1 qw
+ li r10,-33 // r10<- constant used to access 3rd quadword
+ ori r0,r0,0xFFE0 // set dst stride to -16 bytes
+ li r8,-1 // r8<- constant used to access 1st quadword
+ dstst rs,r0,3 // start stream 0
+ subi r0,r11,1 // r11==0 ?
+ lvx va,r8,rs // prefetch 1st source quadword
+ rlwinm r7,rc,28,30,31 // r7 <- count of 16-byte chunks to move
+ lvsl vp,0,rs // get permute vector to right shift
+ cmpwi cr1,r7,0 // remember if any 16-byte chunks
+ beq- LNoChunksReverse // r12==0, so skip inner loop
+ rlwimi r11,r0,0,25,25 // if r11==0, then r11 <- 64
+ sub. r12,r12,r11 // adjust chunk count, set cr0
+ mtctr r11 // set up loop count
+ li r11,-49 // r11<- constant used to access 4th quadword
+ li r0,-64 // r0 <- used for several purposes
+ beq- cr7,LAlignedLoopBy64Reverse
+
+ // Inner loop for unaligned sources. We copy 64 bytes per iteration.
+
+LoopBy64Reverse:
+ lvx vb,r9,rs //(1) 2nd source quadword (1st already in va)
+ lvx vc,r10,rs //(2) 3rd quadword
+ lvx vd,r11,rs //(3) 4th
+ vperm vx,vb,va,vp //(3) vx <- 1st destination quadword
+ lvx va,rs,r0 //(4) get 1st qw of next 64-byte chunk (note r0 must be RB)
+ vperm vy,vc,vb,vp //(4) vy <- 2nd dest qw
+ stvx vx,r8,rd //(5)
+ vperm vz,vd,vc,vp //(5) vz <- 3rd destination quadword
+ stvx vy,r9,rd //(6)
+ vperm vx,va,vd,vp //(6) vx <- 4th qw
+ stvx vz,r10,rd //(7)
+ subi rs,rs,64 //(7)
+ stvx vx,r11,rd //(8)
+ subi rd,rd,64 //(8)
+ bdnz LoopBy64Reverse //(8)
+
+ // End of inner loop. Should we reprime dst stream and restart loop?
+ // This block is only executed when we're moving more than 4KB.
+ // It is usually folded out because cr0 is set in the loop prologue.
+
+ beq+ LNoChunksReverse // r12==0, ie no more chunks to move
+ lis r8,0x0440 // dst control: 64 4-qw blocks
+ add. r12,r12,r0 // set cr0 if more than 4KB remain to xfer
+ ori r8,r8,0xFFC0 // stride is -64 bytes
+ dstst rs,r8,3 // restart the prefetch stream
+ li r8,64 // inner loop count
+ mtctr r8 // initialize loop count to 64
+ li r8,-1 // restore qw1 offset for inner loop
+ b LoopBy64Reverse // restart inner loop, xfer another 4KB
+
+ // Fewer than 64 bytes remain to be moved.
+
+LNoChunksReverse: // r7 and cr1 are set with the number of QWs
+ andi. rc,rc,0xF // rc <- leftover bytes
+ beq- cr1,LCleanupReverse // r7==0, ie fewer than 16 bytes left
+ mtctr r7
+ beq- cr7,LAlignedLoopBy16Reverse
+
+LoopBy16Reverse:
+ lvx vb,r9,rs // vb <- 2nd source quadword
+ subi rs,rs,16
+ vperm vx,vb,va,vp // vx <- next destination quadword
+ vor va,vb,vb // va <- vb
+ stvx vx,r8,rd
+ subi rd,rd,16
+ bdnz LoopBy16Reverse
+
+	// Fewer than 16 bytes remain to be moved.
+
+LCleanupReverse: // rc and cr0 set with remaining byte count
+ lwz r6,-4(r1) // load VRSave from CR save area
+ mtspr VRSave,r6 // restore caller's live-register bitmask
+ beqlr // rc==0, ie no leftovers so done
+ neg r7,rc // get -(#bytes)
+ mtxer rc // byte count
+ lswx r8,r7,rs
+ stswx r8,r7,rd
+ blr
+
+
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// <><> A L I G N E D L O N G R E V E R S E M O V E S <><>
+// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+ // Inner loop. We copy 64 bytes per iteration.
+
+LAlignedLoopBy64Reverse:
+ lvx va,r8,rs //(1)
+ lvx vb,r9,rs //(2)
+ lvx vc,r10,rs //(3)
+ lvx vd,r11,rs //(4)
+ subi rs,rs,64 //(4)
+ stvx va,r8,rd //(5)
+ stvx vb,r9,rd //(6)
+ stvx vc,r10,rd //(7)
+ stvx vd,r11,rd //(8)
+ subi rd,rd,64 //(8)
+ bdnz LAlignedLoopBy64Reverse //(8)
+
+ // End of inner loop. Loop for next 4KB iff any.
+
+ beq+ LNoChunksReverse // r12==0, ie no more chunks to move
+ lis r8,0x0440 // dst control: 64 4-qw blocks
+ add. r12,r12,r0 // r12 <- r12 - 64, set cr0
+ ori r8,r8,0xFFC0 // stride is -64 bytes
+ dstst rs,r8,3 // restart the prefetch stream
+ li r8,64 // inner loop count
+ mtctr r8 // initialize loop count to 64
+ li r8,-1 // restore qw1 offset for inner loop
+ b LAlignedLoopBy64Reverse
+
+ // Loop to copy leftover quadwords (1-3).
+
+LAlignedLoopBy16Reverse:
+ lvx va,r8,rs // get next qw
+ subi rs,rs,16
+ stvx va,r8,rd
+ subi rd,rd,16
+ bdnz LAlignedLoopBy16Reverse
+
+ b LCleanupReverse // handle up to 15 bytes in last qw
+++ /dev/null
-/*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- *
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License"). You may not use this file except in compliance with the
- * License. Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- *
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
- */
-#define MEMCPY
-#include "bcopy.s"
+++ /dev/null
-/*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- *
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License"). You may not use this file except in compliance with the
- * License. Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- *
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
- */
-#define MEMMOVE
-#include "bcopy.s"
#define PROTECT_SMALL 0 // Should be 0: 1 is too slow for normal use
-#define LARGE_CACHE_SIZE 4 // define hysterisis of large chunks
+#define LARGE_CACHE_SIZE		1	// define hysteresis of large chunks
+#define MAX_LARGE_SIZE_TO_CACHE (128*1024) /* blocks larger than this are not cached */
#define MAX_RECORDER_BUFFER 256
static boolean_t szone_check_all(szone_t *szone, const char *function);
static void szone_print(szone_t *szone, boolean_t verbose);
static INLINE region_t *region_for_ptr_no_lock(szone_t *szone, const void *ptr);
+static vm_range_t large_free_no_lock(szone_t *szone, large_entry_t *entry);
#define LOG(szone,ptr) (szone->log_address && (szone->num_small_objects > 8) && (((unsigned)szone->log_address == -1) || (szone->log_address == (void *)(ptr))))
}
static vm_range_t large_free_no_lock(szone_t *szone, large_entry_t *entry) {
- // enters the specified large entry into the cache of freed entries
- // returns a range to truly deallocate
- vm_range_t vm_range_to_deallocate;
+    // frees the specific entry in the size table
+    // returns a range to truly deallocate
vm_range_t range;
- vm_range_t *range_to_use;
range.address = LARGE_ENTRY_ADDRESS(*entry);
range.size = LARGE_ENTRY_SIZE(*entry);
szone->num_large_objects_in_use --;
sleep(3600);
}
#endif
+ return range;
+}
+
+static vm_range_t large_find_better_range_to_deallocate(szone_t *szone, vm_range_t range) {
+ // enters the specified large entry into the cache of freed entries
+ // returns a range to truly deallocate
+ vm_range_t *range_to_use;
+ vm_range_t vm_range_to_deallocate;
+
+    // if the specified range is larger than MAX_LARGE_SIZE_TO_CACHE the range is not cached
+ if (range.size > MAX_LARGE_SIZE_TO_CACHE) return range;
+
range = coalesce_range(szone->large_to_deallocate, LARGE_CACHE_SIZE, range);
range_to_use = first_zero_range(szone->large_to_deallocate, LARGE_CACHE_SIZE);
if (range_to_use) {
vm_msync(mach_task_self(), LARGE_ENTRY_ADDRESS(*entry), LARGE_ENTRY_SIZE(*entry), VM_SYNC_KILLPAGES);
}
vm_range_to_deallocate = large_free_no_lock(szone, entry);
+ vm_range_to_deallocate = large_find_better_range_to_deallocate(szone, vm_range_to_deallocate);
#if DEBUG_MALLOC
if (large_entry_for_pointer_no_lock(szone, ptr)) {
malloc_printf("*** malloc[%d]: Just after freeing 0x%x still in use num_large_entries=%d\n", getpid(), ptr, szone->num_large_entries);
if (szone_try_realloc_in_place(szone, ptr, old_size, new_size)) return ptr;
}
newPtr = szone_malloc(szone, new_size);
- if (old_size > VM_COPY_THRESHOLD) {
+ if ((old_size > VM_COPY_THRESHOLD) && (old_size < (1 << (vm_page_shift + vm_page_shift)))) {
+ // we know it's a large block, and not a huge block
kern_return_t err = 0;
err = vm_copy(mach_task_self(), (vm_address_t)ptr, old_size, (vm_address_t)newPtr);
if (err) {
szone_error(szone, "Can't vm_copy region", ptr);
- }
+ } else {
+ large_entry_t *entry;
+ vm_range_t range;
+ SZONE_LOCK(szone);
+ entry = large_entry_for_pointer_no_lock(szone, ptr);
+ if (!entry) {
+ szone_error(szone, "Can't find entry for large copied block", ptr);
+ }
+ range = large_free_no_lock(szone, entry);
+ SZONE_UNLOCK(szone); // we release the lock asap
+ // we truly deallocate_pages, including guard pages
+ deallocate_pages(szone, range.address, range.size, 0);
+ if (LOG(szone, ptr)) malloc_printf("szone_realloc returned %p for %d\n", newPtr, (unsigned)new_size);
+ return newPtr;
+ }
} else {
memcpy(newPtr, ptr, old_size);
}
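
The realloc change above chooses the copy strategy by size: blocks above VM_COPY_THRESHOLD that are still "large" (not "huge") are moved with vm_copy(), after which the old range is unmapped outright, guard pages included, while everything else falls back to memcpy(). A compact sketch of that decision with placeholder thresholds (the real constants and the huge-block bound come from the zone code, which only applies vm_copy() to its page-aligned large blocks, and which reports an error rather than falling back):

#include <string.h>
#include <mach/mach.h>

#define COPY_THRESHOLD (40 * 1024)      /* placeholder for VM_COPY_THRESHOLD */
#define HUGE_THRESHOLD (4UL << 20)      /* placeholder "huge block" bound */

/* Copy old_size bytes into new_ptr, using the VM primitive for big
 * page-aligned blocks and plain memcpy for everything else. */
static void copy_block(void *new_ptr, void *old_ptr, size_t old_size)
{
	if (old_size > COPY_THRESHOLD && old_size < HUGE_THRESHOLD) {
		kern_return_t err = vm_copy(mach_task_self(),
		    (vm_address_t)old_ptr, old_size, (vm_address_t)new_ptr);
		if (err != KERN_SUCCESS)
			memcpy(new_ptr, old_ptr, old_size);   /* sketch: fall back on failure */
	} else {
		memcpy(new_ptr, old_ptr, old_size);
	}
}
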
return(0);
}
- if (!PathLocale && !(PathLocale = getenv("PATH_LOCALE")))
+ if (!PathLocale)
PathLocale = _PATH_LOCALE;
sprintf(name, "%s/%s/LC_CTYPE", PathLocale, encoding);
int found, i, len;
char *env, *r;
- if (!PathLocale && !(PathLocale = getenv("PATH_LOCALE")))
+ if (!PathLocale)
PathLocale = _PATH_LOCALE;
if (category < 0 || category >= _LC_LAST)
_atfork_child_routine = mach_atfork_child_routine;
_pthread_set_self(0);
cthread_set_self(0);
- }
+ }
/*
* Initialize the single mig reply port
mach_port_t
mach_task_self()
{
- return(mach_task_self_);
+ return(task_self_trap());
}
mach_port_t
mach_thread_self()
{
return(thread_self_trap());
-}
\ No newline at end of file
+}
*/
size_t _pthread_stack_size = 0;
-int _spin_tries = 1;
+int _spin_tries = 0;
+#if !defined(__ppc__)
int _cpu_has_altivec = 0;
+#endif
/* This global should be used (carefully) by anyone needing to know if a pthread has been
** created.
#endif
-/* This is the struct used to recycle (or terminate) a thread */
-/* We stash the thread port into the reply port of the message */
-
-typedef struct {
- mach_msg_header_t header;
- mach_msg_trailer_t trailer;
-} recycle_msg_t;
-
/* Set the base address to use as the stack pointer, before adjusting due to the ABI */
static int
}
}
-pthread_t _cachedThread = (pthread_t)0;
-
-void _clear_thread_cache(void) {
- _cachedThread = (pthread_t)0;
-}
-
/*
* Create and start execution of a new thread.
*/
static void
_pthread_body(pthread_t self)
{
- _clear_thread_cache();
_pthread_set_self(self);
pthread_exit((self->fun)(self->arg));
}
thread->death = MACH_PORT_NULL;
UNLOCK(thread->lock);
if (num_joiners > 0)
- { /* Have to tell these guys this thread can't be joined with */
- swtch_pri(0);
- PTHREAD_MACH_CALL(semaphore_signal_all(thread->joiners), kern_res);
+ {
+ /* Wake up a joiner */
+ PTHREAD_MACH_CALL(semaphore_signal(thread->joiners), kern_res);
}
/* Destroy 'control' semaphores */
PTHREAD_MACH_CALL(semaphore_destroy(mach_task_self(),
PTHREAD_MACH_CALL(semaphore_destroy(mach_task_self(),
death), kern_res);
return (ESUCCESS);
+ } else if (thread->detached == _PTHREAD_EXITED) {
+ UNLOCK(thread->lock);
+ pthread_join(thread, NULL);
+ return ESUCCESS;
} else
{
UNLOCK(thread->lock);
/* terminated, it will be yanked out from under the mach_msg() call. */
static void _pthread_become_available(pthread_t thread) {
- recycle_msg_t msg = { { 0 } };
+ mach_msg_empty_rcv_t msg = { { 0 } };
kern_return_t ret;
+ if (thread->reply_port == MACH_PORT_NULL) {
+ thread->reply_port = mach_reply_port();
+ }
msg.header.msgh_size = sizeof msg - sizeof msg.trailer;
msg.header.msgh_remote_port = thread_recycle_port;
msg.header.msgh_local_port = MACH_PORT_NULL;
msg.header.msgh_id = (int)thread;
msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0);
- ret = mach_msg(&msg.header, MACH_SEND_MSG, msg.header.msgh_size, 0,
- MACH_PORT_NULL, MACH_MSG_TIMEOUT_NONE,
+ ret = mach_msg(&msg.header, MACH_SEND_MSG | MACH_RCV_MSG,
+ msg.header.msgh_size, sizeof msg,
+ thread->reply_port, MACH_MSG_TIMEOUT_NONE,
MACH_PORT_NULL);
while (1) {
ret = thread_suspend(thread->kernel_thread);
/* Check to see if any threads are available. Return immediately */
-static kern_return_t _pthread_check_for_available_threads(recycle_msg_t *msg) {
+static kern_return_t _pthread_check_for_available_threads(mach_msg_empty_rcv_t *msg) {
return mach_msg(&msg->header, MACH_RCV_MSG|MACH_RCV_TIMEOUT, 0,
- sizeof(recycle_msg_t), thread_recycle_port, 0,
+ sizeof(mach_msg_empty_rcv_t), thread_recycle_port, 0,
MACH_PORT_NULL);
}
/* Terminate all available threads and deallocate their stacks */
static void _pthread_reap_threads(void) {
kern_return_t ret;
- recycle_msg_t msg = { { 0 } };
- while(_pthread_check_for_available_threads(&msg) == KERN_SUCCESS) {
+ mach_msg_empty_rcv_t msg = { { 0 } };
+ while((ret = _pthread_check_for_available_threads(&msg)) == KERN_SUCCESS) {
pthread_t th = (pthread_t)msg.header.msgh_id;
mach_port_t kernel_thread = th->kernel_thread;
mach_port_t reply_port = th->reply_port;
}
free(th);
}
+ assert(ret == MACH_RCV_TIMED_OUT);
}
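Reading the two fragments above together: a terminating thread records its own pthread_t in msgh_id, sends the empty message to thread_recycle_port, and then blocks in the combined send/receive on its private reply port (which, per the earlier comment, is simply yanked out from under mach_msg() when the thread is terminated). _pthread_reap_threads() drains thread_recycle_port with a zero receive timeout, recovers each pthread_t from msgh_id, tears down the kernel thread and its ports, and frees the structure; the assert() documents that the loop can only end with MACH_RCV_TIMED_OUT, i.e. an empty queue.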
-
-static void *
-stackAddress(void)
-{
- unsigned dummy;
- return (void *)((unsigned)&dummy & ~ (PTHREAD_STACK_MIN - 1));
-}
-
-extern pthread_t _pthread_self(void);
+/* For compatibility... */
pthread_t
-pthread_self(void)
-{
- void * myStack = (void *)0;
- pthread_t cachedThread = _cachedThread;
- if (cachedThread) {
- myStack = stackAddress();
- if ((void *)((unsigned)(cachedThread->stackaddr - 1) & ~ (PTHREAD_STACK_MIN - 1)) == myStack) {
- return cachedThread;
- }
- }
- _cachedThread = _pthread_self();
- return _cachedThread;
+_pthread_self() {
+ return pthread_self();
}
/*
struct _pthread_handler_rec *handler;
kern_return_t kern_res;
int num_joiners;
- _clear_thread_cache();
while ((handler = self->cleanup_stack) != 0)
{
(handler->routine)(handler->arg);
UNLOCK(self->lock);
if (num_joiners > 0)
{
- swtch_pri(0);
- PTHREAD_MACH_CALL(semaphore_signal_all(self->joiners), kern_res);
+ /* POSIX says that multiple pthread_join() calls on */
+ /* the same thread are undefined so we just wake up */
+ /* the first one to join */
+ PTHREAD_MACH_CALL(semaphore_signal(self->joiners), kern_res);
}
- PTHREAD_MACH_CALL(semaphore_wait(self->death), kern_res);
+ do {
+ PTHREAD_MACH_CALL(semaphore_wait(self->death), kern_res);
+ } while (kern_res == KERN_ABORTED);
} else
UNLOCK(self->lock);
/* Destroy thread & reclaim resources */
{
thread->num_joiners++;
UNLOCK(thread->lock);
- PTHREAD_MACH_CALL(semaphore_wait(thread->joiners), kern_res);
+ do {
+ PTHREAD_MACH_CALL(semaphore_wait(thread->joiners), kern_res);
+ } while (kern_res == KERN_ABORTED);
LOCK(thread->lock);
thread->num_joiners--;
}
*value_ptr = thread->exit_value;
}
UNLOCK(thread->lock);
- swtch_pri(0);
PTHREAD_MACH_CALL(semaphore_signal(thread->death), kern_res);
return (ESUCCESS);
} else
}
attrs = &_attr;
pthread_attr_init(attrs);
- _clear_thread_cache();
- _pthread_set_self(&_thread);
+ _pthread_set_self(&_thread);
_pthread_create(&_thread, attrs, USRSTACK, mach_thread_self());
- thread = (pthread_t)malloc(sizeof(struct _pthread));
- memcpy(thread, &_thread, sizeof(struct _pthread));
- _clear_thread_cache();
- _pthread_set_self(thread);
+ thread = &_thread;
thread->detached = _PTHREAD_CREATE_PARENT;
/* See if we're on a multiprocessor and set _spin_tries if so. */
len = sizeof(numcpus);
if (sysctl(mib, 2, &numcpus, &len, NULL, 0) == 0) {
if (numcpus > 1) {
- _spin_tries = SPIN_TRIES;
+ _spin_tries = MP_SPIN_TRIES;
}
} else {
count = HOST_BASIC_INFO_COUNT;
printf("host_info failed (%d)\n", kr);
else {
if (basic_info.avail_cpus > 1)
- _spin_tries = SPIN_TRIES;
+ _spin_tries = MP_SPIN_TRIES;
/* This is a crude test */
if (basic_info.cpu_subtype >= CPU_SUBTYPE_POWERPC_7400)
_cpu_has_altivec = 1;
if ((res = pthread_mutex_lock(mutex)) != ESUCCESS) {
return (res);
}
- if (kern_res == KERN_SUCCESS) {
+ /* KERN_ABORTED can be treated as a spurious wakeup */
+ if ((kern_res == KERN_SUCCESS) || (kern_res == KERN_ABORTED)) {
return (ESUCCESS);
} else if (kern_res == KERN_OPERATION_TIMED_OUT) {
return (ETIMEDOUT);
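Since KERN_ABORTED is now folded into the success path, a condition wait can return with the predicate still false (a spurious wakeup); the standard POSIX retest loop absorbs this. A minimal sketch, with hypothetical names (q_lock, q_cond, q_ready):

#include <pthread.h>

static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  q_cond = PTHREAD_COND_INITIALIZER;
static int q_ready = 0;

static void wait_until_ready(void) {
	pthread_mutex_lock(&q_lock);
	while (!q_ready)		/* re-test: a KERN_ABORTED wakeup now comes back as ESUCCESS */
		pthread_cond_wait(&q_cond, &q_lock);
	pthread_mutex_unlock(&q_lock);
}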
/* Number of times to spin when the lock is unavailable and we are on a
multiprocessor. On a uniprocessor we yield the processor immediately. */
-#define SPIN_TRIES 10
+#define MP_SPIN_TRIES 1000
extern int _spin_tries;
extern int __is_threaded;
extern int _cpu_has_altivec;
/* Internal mutex locks for data structures */
-#define TRY_LOCK(v) (!__is_threaded || _spin_lock_try((pthread_lock_t *)&v))
-#if 0
-#define LOCK(v) if (__is_threaded) _spin_lock((pthread_lock_t)&v)
-#else
-#define LOCK(v) \
- if (__is_threaded) { \
- while (!_spin_lock_try((pthread_lock_t *)&v)) { \
- syscall_thread_switch(THREAD_NULL, SWITCH_OPTION_WAIT, 1); \
- } \
- }
-#endif
-#define UNLOCK(v) if (__is_threaded) _spin_unlock((pthread_lock_t *)&v)
+#define TRY_LOCK(v) (!__is_threaded || _spin_lock_try((pthread_lock_t *)&(v)))
+#define LOCK(v) \
+do { \
+ if (__is_threaded) { \
+ int tries = _spin_tries; \
+ \
+ while (!_spin_lock_try((pthread_lock_t *)&(v))) { \
+ if (tries-- > 0) \
+ continue; \
+ \
+ syscall_thread_switch(THREAD_NULL, SWITCH_OPTION_DEPRESS, 1); \
+ tries = _spin_tries; \
+ } \
+ } \
+} while (0)
+#define UNLOCK(v) \
+do { \
+ if (__is_threaded) \
+ _spin_unlock((pthread_lock_t *)&(v)); \
+} while (0)
+
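For orientation, a sketch of how these internal macros are used inside the library; the lock word and counter below are hypothetical, pthread_lock_t and the _spin_* primitives are the library's own, and a zeroed lock word is assumed to be the unlocked state:

static pthread_lock_t counter_lock;	/* assumption: zero-initialized == unlocked */
static int counter;

static void bump(void) {
	LOCK(counter_lock);	/* spins up to _spin_tries times, then depresses priority and retries */
	counter++;
	UNLOCK(counter_lock);	/* both macros are no-ops until __is_threaded is set */
}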
#ifndef ESUCCESS
#define ESUCCESS 0
#endif
mutex->sem = new_sem_from_pool();
}
UNLOCK(mutex->lock);
- PTHREAD_MACH_CALL(semaphore_wait(mutex->sem), kern_res);
+ do {
+ PTHREAD_MACH_CALL(semaphore_wait(mutex->sem), kern_res);
+ } while (kern_res == KERN_ABORTED);
LOCK(mutex->lock);
mutex->waiters--;
if (mutex->waiters == 0) {
#define BUF (MAXEXP+MAXFRACT+1) /* + decimal point */
#define DEFPREC 6
-static char *cvt __P((double, int, int, char *, int *, int, int *));
+static char *cvt __P((double, int, int, char *, int *, int, int *, char **));
static int exponent __P((char *, int, int));
#else /* no FLOATING_POINT */
int expsize = 0; /* character count for expstr */
int ndig; /* actual number of digits returned by cvt */
char expstr[7]; /* buffer for exponent string */
+ char *dtoaresult; /* buffer allocated by dtoa */
#endif
u_long ulval = 0; /* integer arguments %[diouxX] */
u_quad_t uqval = 0; /* %q integers */
} else { \
val = GETARG (int); \
}
-
-
+#ifdef FLOATING_POINT
+ dtoaresult = NULL;
+#endif
/* FLOCKFILE(fp); */
/* sorry, fprintf(read_only_file, "") returns EOF, not 0 */
if (cantwrite(fp)) {
}
flags |= FPT;
cp = cvt(_double, prec, flags, &softsign,
- &expt, ch, &ndig);
+ &expt, ch, &ndig, &dtoaresult);
if (ch == 'g' || ch == 'G') {
if (expt <= -4 || expt > prec)
ch = (ch == 'g') ? 'e' : 'E';
done:
FLUSH();
error:
+#ifdef FLOATING_POINT
+ if (dtoaresult != NULL)
+ free(dtoaresult);
+#endif
if (__sferror(fp))
ret = EOF;
/* FUNLOCKFILE(fp); */
* Find all arguments when a positional parameter is encountered. Returns a
* table, indexed by argument number, of pointers to each argument. The
* initial argument table should be an array of STATIC_ARG_TBL_SIZE entries.
- * It will be replaces with a malloc-ed on if it overflows.
+ * It will be replaced with a malloc-ed one if it overflows.
*/
static void
__find_arguments (fmt0, ap, argtable)
#define ADDTYPE(type) \
((nextarg >= tablesize) ? \
__grow_type_table(nextarg, &typetable, &tablesize) : 0, \
- typetable[nextarg++] = type, \
- (nextarg > tablemax) ? tablemax = nextarg : 0)
+ (nextarg > tablemax) ? tablemax = nextarg : 0, \
+ typetable[nextarg++] = type)
#define ADDSARG() \
((flags&LONGINT) ? ADDTYPE(T_LONG) : \
unsigned char **typetable;
int *tablesize;
{
- unsigned char *oldtable = *typetable;
- int newsize = *tablesize * 2;
-
- if (*tablesize == STATIC_ARG_TBL_SIZE) {
- *typetable = (unsigned char *)
- malloc (sizeof (unsigned char) * newsize);
- bcopy (oldtable, *typetable, *tablesize);
+ unsigned char *const oldtable = *typetable;
+ const int oldsize = *tablesize;
+ unsigned char *newtable;
+ int newsize = oldsize * 2;
+
+ if (newsize < nextarg + 1)
+ newsize = nextarg + 1;
+ if (oldsize == STATIC_ARG_TBL_SIZE) {
+ if ((newtable = malloc (newsize)) == NULL)
+ abort(); /* XXX handle better */
+ bcopy (oldtable, newtable, oldsize);
} else {
- *typetable = (unsigned char *)
- realloc (typetable, sizeof (unsigned char) * newsize);
-
+ if ((newtable = realloc (oldtable, newsize)) == NULL)
+ abort(); /* XXX handle better */
}
- memset (&typetable [*tablesize], T_UNUSED, (newsize - *tablesize));
+ memset (&newtable [oldsize], T_UNUSED, (newsize - oldsize));
+ *typetable = newtable;
*tablesize = newsize;
}
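Positional (%n$) conversions are what drive __find_arguments(), and __grow_type_table() only runs once more than STATIC_ARG_TBL_SIZE argument types must be recorded. A small, illustrative use of the feature itself:

#include <stdio.h>

/* Both conversions are numbered, so vfprintf scans the format first, records
 * each argument's type in the table, and can then fetch them out of order. */
static int greet(void) {
	return printf("%2$s scored %1$d points\n", 42, "Ada");	/* "Ada scored 42 points" */
}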
#ifdef FLOATING_POINT
-extern char *__dtoa __P((double, int, int, int *, int *, char **));
+extern char *__dtoa __P((double, int, int, int *, int *, char **, char **));
static char *
-cvt(value, ndigits, flags, sign, decpt, ch, length)
+cvt(value, ndigits, flags, sign, decpt, ch, length, dtoaresultp)
double value;
int ndigits, flags, *decpt, ch, *length;
char *sign;
+ char **dtoaresultp;
{
int mode, dsgn;
char *digits, *bp, *rve;
*sign = '-';
} else
*sign = '\000';
- digits = __dtoa(value, mode, ndigits, decpt, &dsgn, &rve);
+ digits = __dtoa(value, mode, ndigits, decpt, &dsgn, &rve, dtoaresultp);
if ((ch != 'g' && ch != 'G') || flags & ALT) {
/* print trailing zeros */
bp = digits + ndigits;
#define SUPPRESS 0x08 /* suppress assignment */
#define POINTER 0x10 /* weird %p pointer (`fake hex') */
#define NOSKIP 0x20 /* do not skip blanks */
+#define QUAD 0x400
/*
* The following are used in numeric conversions only:
#define CT_CHAR 0 /* %c conversion */
#define CT_CCL 1 /* %[...] conversion */
#define CT_STRING 2 /* %s conversion */
-#define CT_INT 3 /* integer, i.e., strtol or strtoul */
+#define CT_INT 3 /* integer, i.e., strtoq or strtouq */
#define CT_FLOAT 4 /* floating, i.e., strtod */
#define u_char unsigned char
#define u_long unsigned long
-static u_char *__sccl();
+static u_char *__sccl(char *, u_char *);
/*
* vfscanf
register char *p0; /* saves original value of p when necessary */
int nassigned; /* number of fields assigned */
int nread; /* number of characters consumed from fp */
- int base; /* base argument to strtol/strtoul */
- u_long (*ccfn)(); /* conversion function (strtol/strtoul) */
+ int base; /* base argument to strtoq/strtouq */
+ u_quad_t (*ccfn)(); /* conversion function (strtoq/strtouq) */
char ccltab[256]; /* character class table for %[...] */
char buf[BUF]; /* buffer for numeric conversions */
case 'l':
flags |= LONG;
goto again;
+ case 'q':
+ flags |= QUAD;
+ goto again;
case 'L':
flags |= LONGDBL;
goto again;
/* FALLTHROUGH */
case 'd':
c = CT_INT;
- ccfn = (u_long (*)())strtol;
+ ccfn = (u_quad_t (*)())strtoq;
base = 10;
break;
case 'i':
c = CT_INT;
- ccfn = (u_long (*)())strtol;
+ ccfn = (u_quad_t (*)())strtoq;
base = 0;
break;
/* FALLTHROUGH */
case 'o':
c = CT_INT;
- ccfn = strtoul;
+ ccfn = strtouq;
base = 8;
break;
case 'u':
c = CT_INT;
- ccfn = strtoul;
+ ccfn = strtouq;
base = 10;
break;
case 'x':
flags |= PFXOK; /* enable 0x prefixing */
c = CT_INT;
- ccfn = strtoul;
+ ccfn = strtouq;
base = 16;
break;
case 'p': /* pointer format is like hex */
flags |= POINTER | PFXOK;
c = CT_INT;
- ccfn = strtoul;
+ ccfn = strtouq;
base = 16;
break;
*va_arg(ap, short *) = nread;
else if (flags & LONG)
*va_arg(ap, long *) = nread;
+ else if (flags & QUAD)
+ *va_arg(ap, quad_t *) = nread;
else
*va_arg(ap, int *) = nread;
continue;
if (isupper(c))
flags |= LONG;
c = CT_INT;
- ccfn = (u_long (*)())strtol;
+ ccfn = (u_quad_t (*)())strtoq;
base = 10;
break;
}
continue;
case CT_INT:
- /* scan an integer as if by strtol/strtoul */
+ /* scan an integer as if by strtoq/strtouq */
#ifdef hardway
if (width == 0 || width > sizeof(buf) - 1)
width = sizeof(buf) - 1;
(void) ungetc(c, fp);
}
if ((flags & SUPPRESS) == 0) {
- u_long res;
+ u_quad_t res;
*p = 0;
res = (*ccfn)(buf, (char **)NULL, base);
*va_arg(ap, short *) = res;
else if (flags & LONG)
*va_arg(ap, long *) = res;
+ else if (flags & QUAD)
+ *va_arg(ap, quad_t *) = res;
else
*va_arg(ap, int *) = res;
nassigned++;
*p = 0;
res = strtod(buf,(char **) NULL);
- if (flags & LONG)
+ if (flags & LONGDBL)
+ *va_arg(ap, long double *) = res;
+ else if (flags & LONG)
*va_arg(ap, double *) = res;
else
*va_arg(ap, float *) = res;
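With the new 'q' length modifier, the integer conversions run through strtoq()/strtouq() and can store into a 64-bit quad_t; a hedged sketch (8589934592 is 2^33, too big for a 32-bit long):

#include <stdio.h>
#include <sys/types.h>		/* quad_t */

static int read_big(void) {
	quad_t bytes = 0;
	if (sscanf("8589934592", "%qd", &bytes) != 1)
		return -1;
	return bytes == 8589934592LL;	/* 1 on success */
}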
#ifdef __cplusplus
extern "C" double strtod(const char *s00, char **se);
extern "C" char *__dtoa(double d, int mode, int ndigits,
- int *decpt, int *sign, char **rve);
+ int *decpt, int *sign, char **rve, char **resultp);
#endif
struct
typedef struct Bigint Bigint;
- static Bigint *freelist[Kmax+1];
-
static Bigint *
Balloc
#ifdef KR_headers
int x;
Bigint *rv;
- if (rv = freelist[k]) {
- freelist[k] = rv->next;
- }
- else {
- x = 1 << k;
- rv = (Bigint *)MALLOC(sizeof(Bigint) + (x-1)*sizeof(Long));
- rv->k = k;
- rv->maxwds = x;
- }
+ x = 1 << k;
+ rv = (Bigint *)malloc(sizeof(Bigint) + (x-1)*sizeof(Long));
+ rv->k = k;
+ rv->maxwds = x;
rv->sign = rv->wds = 0;
return rv;
- }
+}
static void
Bfree
(Bigint *v)
#endif
{
- if (v) {
- v->next = freelist[v->k];
- freelist[v->k] = v;
- }
- }
+ free(v);
+}
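With the static freelist gone here (and the static result buffer removed further down), Balloc()/Bfree() no longer share state between calls, and __dtoa() hands ownership of its digit buffer back through the added resultp argument. A sketch of the caller contract, mirroring how cvt()/vfprintf() above use it; digit_count() and its mode/ndigits constants are hypothetical:

#include <stdlib.h>

extern char *__dtoa(double, int, int, int *, int *, char **, char **);

static int digit_count(double value) {
	char *mem = NULL, *rve;
	int decpt, sign;
	char *digits = __dtoa(value, 2, 17, &decpt, &sign, &rve, &mem);
	int n = rve - digits;		/* rve points just past the last digit */
	if (mem != NULL)
		free(mem);		/* the caller, not __dtoa, frees the buffer now */
	return n;
}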
#define Bcopy(x,y) memcpy((char *)&x->sign, (char *)&y->sign, \
y->wds*sizeof(Long) + 2*sizeof(int))
__dtoa
#ifdef KR_headers
(d, mode, ndigits, decpt, sign, rve, resultp)
- double d; int mode, ndigits, *decpt, *sign; char **rve;
+	double d; int mode, ndigits, *decpt, *sign; char **rve, **resultp;
#else
- (double d, int mode, int ndigits, int *decpt, int *sign, char **rve)
+ (double d, int mode, int ndigits, int *decpt, int *sign, char **rve, char **resultp)
#endif
{
/* Arguments ndigits, decpt, sign are similar to those
Bigint *b, *b1, *delta, *mlo, *mhi, *S;
double d2, ds, eps;
char *s, *s0;
- static Bigint *result;
- static int result_k;
-
- if (result) {
- result->k = result_k;
- result->maxwds = 1 << result_k;
- Bfree(result);
- result = 0;
- }
if (word0(d) & Sign_bit) {
/* set sign for everything, including 0's and NaNs */
if (i <= 0)
i = 1;
}
- j = sizeof(ULong);
- for(result_k = 0; sizeof(Bigint) - sizeof(ULong) + j <= i;
- j <<= 1) result_k++;
- result = Balloc(result_k);
- s = s0 = (char *)result;
+ *resultp = (char *) malloc(i + 1);
+ s = s0 = *resultp;
if (ilim >= 0 && ilim <= Quick_max && try_quick) {
if (n) {
register unsigned char *tp = t;
register const unsigned char *fp = f;
+ register unsigned char uc = c;
do {
- if ((*tp++ = *fp++) == c)
- return (t);
+ if ((*tp++ = *fp++) == uc)
+ return (tp);
} while (--n != 0);
}
return (0);
{
static int validtz = 0;
static struct timezone cached_tz = {0};
+ struct timeval localtv;
+
+ if (tzp && (tp == NULL) && (validtz == 0)) {
+ tp = &localtv;
+ }
if (syscall (SYS_gettimeofday, tp, tzp) < 0) {
return (-1);
}
- if (validtz == 0) {
- struct tm *localtm = localtime ((time_t *)&tp->tv_sec);
- cached_tz.tz_dsttime = localtm->tm_isdst;
- cached_tz.tz_minuteswest =
- (-localtm->tm_gmtoff / SECSPERMIN) +
- (localtm->tm_isdst * MINSPERHOUR);
- validtz = 1;
- }
if (tzp) {
- tzp->tz_dsttime = cached_tz.tz_dsttime;
- tzp->tz_minuteswest = cached_tz.tz_minuteswest;
+ if (validtz == 0) {
+ struct tm *localtm = localtime ((time_t *)&tp->tv_sec);
+ cached_tz.tz_dsttime = localtm->tm_isdst;
+ cached_tz.tz_minuteswest =
+ (-localtm->tm_gmtoff / SECSPERMIN) +
+ (localtm->tm_isdst * MINSPERHOUR);
+ validtz = 1;
+ }
+ tzp->tz_dsttime = cached_tz.tz_dsttime;
+ tzp->tz_minuteswest = cached_tz.tz_minuteswest;
}
return (0);
}
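One consequence of moving the localtime() work under the tzp check and substituting a local timeval: a caller that only wants the timezone may now pass a NULL timeval pointer. A hedged sketch:

#include <stdio.h>
#include <sys/time.h>

static int show_tz(void) {
	struct timezone tz;
	if (gettimeofday(NULL, &tz) != 0)	/* tp == NULL with a non-NULL tzp is now safe */
		return -1;
	return printf("%d minutes west of GMT, dst=%d\n", tz.tz_minuteswest, tz.tz_dsttime);
}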
*/
#include "SYS.h"
+#if 0
LEAF(_vfork, 0)
CALL_EXTERN(__cthread_fork_prepare)
#if defined(__DYNAMIC__)
CALL_EXTERN_AGAIN(__cthread_fork_parent)
pop %eax
ret
+#else
+
+LEAF(_vfork, 0)
+ popl %ecx
+ movl $SYS_vfork,%eax; // code for vfork -> eax
+ UNIX_SYSCALL_TRAP; // do the system call
+ jnb L1 // jump if CF==0
+ pushl %ecx
+ BRANCH_EXTERN(cerror)
+
+L1:
+ orl %edx,%edx // CF=OF=0, ZF set if zero result
+	jz	L2		// parent, since %edx == 0 in parent, 1 in child
+ xorl %eax,%eax // zero eax
+ jmp *%ecx
+
+L2:
+ jmp *%ecx
+
+#endif
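Reader's note on the new direct-trap path: vfork() cannot go through a normal call frame because parent and child share the stack, so the return address is popped into %ecx before the trap; on success the parent (%edx == 0, %eax holding the pid the kernel returned) and the child (%edx != 0, %eax zeroed) both leave with jmp *%ecx rather than ret, and on failure the return address is pushed back so cerror can return normally.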
* 8 September 1998 Matt Watson (mwatson@apple.com)
* Created. Derived from longjmp.s
*/
-#include "SYS.h"
+
#include <architecture/ppc/asm_help.h>
#include "_setjmp.h"
+#define VRSave 256
+
+/* special flag bit definitions copied from /osfmk/ppc/thread_act.h */
+
+#define floatUsedbit 1
+#define vectorUsedbit 2
+
+
+#if defined(__DYNAMIC__)
+ .data
+ .non_lazy_symbol_pointer
+ .align 2
+L_memmove$non_lazy_ptr:
+ .indirect_symbol _memmove
+ .long 0
+ .non_lazy_symbol_pointer
+ .align 2
+L__cpu_has_altivec$non_lazy_ptr:
+ .indirect_symbol __cpu_has_altivec
+ .long 0
+ .text
+#endif
+
LEAF(__longjmp)
+
+ ; need to restore FPRs or VRs?
+
+ lwz r5,JMP_flags(r3)
+ lwz r6,JMP_addr_at_setjmp(r3)
+ rlwinm r7,r5,0,vectorUsedbit,vectorUsedbit
+ rlwinm r8,r5,0,floatUsedbit,floatUsedbit
+ cmpw cr1,r3,r6 ; jmp_buf still at same address?
+ cmpwi cr3,r7,0 ; set cr3 iff VRs in use (non-volatile CR)
+ cmpwi cr4,r8,0 ; set cr4 iff FPRs in use (non-volatile CR)
+ beq+ cr1,LRestoreVRs
+
+ ; jmp_buf was moved since setjmp (or is uninitialized.)
+ ; We must move VRs and FPRs to be quadword aligned at present address.
+
+ stw r3,JMP_addr_at_setjmp(r3) ; update, in case we longjmp to this again
+ mr r31,r4 ; save "val" arg across memmove
+ mr r30,r3 ; and jmp_buf ptr
+ addi r3,r3,JMP_vr_base_addr
+ addi r4,r6,JMP_vr_base_addr
+ rlwinm r3,r3,0,0,27 ; r3 <- QW aligned addr where they should be
+ rlwinm r4,r4,0,0,27 ; r4 <- QW aligned addr where they originally were
+ sub r7,r4,r6 ; r7 <- offset of VRs/FPRs within jmp_buf
+ add r4,r30,r7 ; r4 <- where they are now
+ li r5,(JMP_buf_end - JMP_vr_base_addr)
+#if defined(__DYNAMIC__)
+ bcl 20,31,1f ; Get pic-base
+1: mflr r12
+ addis r12, r12, ha16(L_memmove$non_lazy_ptr - 1b)
+ lwz r12, lo16(L_memmove$non_lazy_ptr - 1b)(r12)
+ mtctr r12 ; Get address left by dyld
+ bctrl
+#else
+ bl _memmove
+#endif
+ mr r3,r30
+ mr r4,r31
+
+ ; Restore VRs iff any
+ ; cr3 - bne if VRs
+ ; cr4 - bne if FPRs
+
+LRestoreVRs:
+ beq+ cr3,LZeroVRSave ; no VRs
+ lwz r0,JMP_vrsave(r3)
+ addi r6,r3,JMP_vr_base_addr
+ cmpwi r0,0 ; any live VRs?
+ mtspr VRSave,r0
+ beq+ LRestoreFPRs
+ lvx v20,0,r6
+ li r7,16*1
+ lvx v21,r7,r6
+ li r7,16*2
+ lvx v22,r7,r6
+ li r7,16*3
+ lvx v23,r7,r6
+ li r7,16*4
+ lvx v24,r7,r6
+ li r7,16*5
+ lvx v25,r7,r6
+ li r7,16*6
+ lvx v26,r7,r6
+ li r7,16*7
+ lvx v27,r7,r6
+ li r7,16*8
+ lvx v28,r7,r6
+ li r7,16*9
+ lvx v29,r7,r6
+ li r7,16*10
+ lvx v30,r7,r6
+ li r7,16*11
+ lvx v31,r7,r6
+ b LRestoreFPRs ; skip zeroing VRSave
+
+ ; Zero VRSave iff Altivec is supported, but VRs were not in use
+ ; at setjmp time. This covers the case where VRs are first used after
+ ; the setjmp but before the longjmp, and where VRSave is nonzero at
+ ; the longjmp. We need to zero it now, or it will always remain
+	; nonzero, since its bits are sticky.
+
+LZeroVRSave:
+#if defined(__DYNAMIC__)
+ bcl 20,31,1f
+1: mflr r9 ; get our address
+ addis r6,r9,ha16(L__cpu_has_altivec$non_lazy_ptr - 1b)
+ lwz r7,lo16(L__cpu_has_altivec$non_lazy_ptr - 1b)(r6)
+ lwz r7,0(r7) ; load the flag
+#else
+ lis r7, ha16(__cpu_has_altivec)
+ lwz r7, lo16(__cpu_has_altivec)(r7)
+#endif
+ cmpwi r7,0
+ li r8,0
+ beq LRestoreFPRs ; no Altivec, so skip
+ mtspr VRSave,r8
+
+ ; Restore FPRs if any
+ ; cr4 - bne iff FPRs
+
+LRestoreFPRs:
+ beq cr4,LRestoreGPRs ; FPRs not in use at setjmp
+ addi r6,r3,JMP_fp_base_addr
+ rlwinm r6,r6,0,0,27 ; mask off low 4 bits to qw align
+ lfd f14,0*8(r6)
+ lfd f15,1*8(r6)
+ lfd f16,2*8(r6)
+ lfd f17,3*8(r6)
+ lfd f18,4*8(r6)
+ lfd f19,5*8(r6)
+ lfd f20,6*8(r6)
+ lfd f21,7*8(r6)
+ lfd f22,8*8(r6)
+ lfd f23,9*8(r6)
+ lfd f24,10*8(r6)
+ lfd f25,11*8(r6)
+ lfd f26,12*8(r6)
+ lfd f27,13*8(r6)
+ lfd f28,14*8(r6)
+ lfd f29,15*8(r6)
+ lfd f30,16*8(r6)
+ lfd f31,17*8(r6)
+
+ ; Restore GPRs
+
+LRestoreGPRs:
lwz r31, JMP_r31(r3)
/* r1, r14-r30 */
lwz r1, JMP_r1 (r3)
*
*/
+/* NOTE: jmp_bufs are only 4-byte aligned. This means we
+ * need to pad before the VR and FPR save areas, so that they
+ * can be naturally aligned in the buffer. In case a jmp_buf
+ * is bcopy'd to a different alignment between the setjmp
+ * and longjmp, we need to save the jmp_buf address in the
+ * jmp_buf at setjmp time, so we can realign before reloading.
+ */
+
#define JMP_r1 0x00
#define JMP_r2 0x04
#define JMP_r13 0x08
#define JMP_xer 0x60
#define JMP_sig 0x64
#define JMP_SIGFLAG 0x68
+#define JMP_flags 0x6c
+#define JMP_vrsave 0x70
+#define JMP_addr_at_setjmp 0x74
+/* 12 bytes padding here */
+#define JMP_vr_base_addr 0x84
+/* save room for 12 VRs (v20-v31), or 0xC0 bytes */
+#define JMP_fp_base_addr 0x144
+/* save room for 18 FPRs (f14-f31), or 0x90 bytes */
+#define JMP_buf_end 0x1d4
+
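As a cross-check of the offsets above (reader's arithmetic, not part of the original header):

/*
 * JMP_addr_at_setjmp (0x74) + 4 bytes            = 0x78
 * 0x78 + the 12 bytes of padding                 = 0x84  (JMP_vr_base_addr)
 * 0x84 + 12 VRs (v20-v31) * 16 bytes (0xC0)      = 0x144 (JMP_fp_base_addr)
 * 0x144 + 18 FPRs (f14-f31) * 8 bytes (0x90)     = 0x1d4 (JMP_buf_end)
 * The 12 spare bytes are what let _setjmp/_longjmp quadword-align the save
 * areas at run time (the rlwinm ...,0,0,27 masking in the .s files above).
 */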
* Created. Derived from setjmp.s
*/
-#include "SYS.h"
+
#include <architecture/ppc/asm_help.h>
#include "_setjmp.h"
+#define VRSave 256
+
+/* special flag bit definitions copied from /osfmk/ppc/thread_act.h */
+
+#define floatUsedbit 1
+#define vectorUsedbit 2
+
+#define FlagsFastTrap 0x7FF3
+
+
LEAF(__setjmp)
stw r31, JMP_r31(r3)
/* r1, r2, r13-r30 */
stw r5, JMP_lr(r3)
stw r6, JMP_ctr(r3)
stw r7, JMP_xer(r3)
- li r3, 0
+
+ mr r31,r3 ; save jmp_buf ptr
+ li r0,FlagsFastTrap
+ sc ; get FPR-inuse and VR-inuse flags from kernel
+ rlwinm r4,r3,0,floatUsedbit,floatUsedbit
+ rlwinm. r5,r3,0,vectorUsedbit,vectorUsedbit
+ cmpwi cr1,r4,0 ; set CR1 bne iff FPRs in use
+ stw r3,JMP_flags(r31)
+ stw r31,JMP_addr_at_setjmp(r31)
+ mr r3,r31 ; restore jmp_buf ptr
+ lwz r31,JMP_r31(r31)
+ beq LSaveFPRsIfNecessary ; skip if vectorUsedbit was 0
+
+ ; must save VRs and VRSAVE
+
+ mfspr r4,VRSave
+ andi. r0,r4,0xFFF ; we only care about v20-v31
+ stw r0,JMP_vrsave(r3) ; set up effective VRSAVE
+ beq LSaveFPRsIfNecessary ; no live non-volatile VRs
+ addi r6,r3,JMP_vr_base_addr
+ stvx v20,0,r6
+ li r4,16*1
+ stvx v21,r4,r6
+ li r4,16*2
+ stvx v22,r4,r6
+ li r4,16*3
+ stvx v23,r4,r6
+ li r4,16*4
+ stvx v24,r4,r6
+ li r4,16*5
+ stvx v25,r4,r6
+ li r4,16*6
+ stvx v26,r4,r6
+ li r4,16*7
+ stvx v27,r4,r6
+ li r4,16*8
+ stvx v28,r4,r6
+ li r4,16*9
+ stvx v29,r4,r6
+ li r4,16*10
+ stvx v30,r4,r6
+ li r4,16*11
+ stvx v31,r4,r6
+
+ ; must save FPRs if they are live in this thread
+ ; CR1 = bne iff FPRs are in use
+
+LSaveFPRsIfNecessary:
+ beq cr1,LExit ; FPRs not in use
+ addi r6,r3,JMP_fp_base_addr
+ rlwinm r6,r6,0,0,27 ; mask off low 4 bits to qw align
+ stfd f14,0*8(r6)
+ stfd f15,1*8(r6)
+ stfd f16,2*8(r6)
+ stfd f17,3*8(r6)
+ stfd f18,4*8(r6)
+ stfd f19,5*8(r6)
+ stfd f20,6*8(r6)
+ stfd f21,7*8(r6)
+ stfd f22,8*8(r6)
+ stfd f23,9*8(r6)
+ stfd f24,10*8(r6)
+ stfd f25,11*8(r6)
+ stfd f26,12*8(r6)
+ stfd f27,13*8(r6)
+ stfd f28,14*8(r6)
+ stfd f29,15*8(r6)
+ stfd f30,16*8(r6)
+ stfd f31,17*8(r6)
+
+LExit:
+ li r3, 0
blr
*/
.text
.align 2
- .globl __pthread_self
-__pthread_self:
+ .globl _pthread_self
+_pthread_self:
li r0, 0x7FF2
sc
blr
*
*/
-#if 1
+#if 0
#import <sys/syscall.h>
#import <architecture/ppc/asm_help.h>
#import <architecture/ppc/pseudo_inst.h>
HFILES = cthread_internals.h cthreads.h
-CFILES = cprocs.c cthreads.c lu_utils.c mig_support.c threads_data.c
+CFILES = cprocs.c cthreads.c lu_utils.c mig_support.c
SUBPROJECTS = i386.subproj ppc.subproj
DYNAMIC_CODE_GEN = YES;
FILESTABLE = {
H_FILES = (cthread_internals.h, cthreads.h);
- OTHER_LINKED = (cprocs.c, cthreads.c, lu_utils.c, mig_support.c, threads_data.c);
+ OTHER_LINKED = (cprocs.c, cthreads.c, lu_utils.c, mig_support.c);
OTHER_SOURCES = (Makefile.preamble, Makefile, Makefile.postamble);
PROJECT_HEADERS = (cthread_internals.h, cthreads.h);
SUBPROJECTS = (i386.subproj, ppc.subproj);
}
void *
-_pthread_self()
+pthread_self()
{
asm("movl $0, %eax");
asm("lcall $0x3b, $0");
+++ /dev/null
-/*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- *
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License"). You may not use this file except in compliance with the
- * License. Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- *
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
- */
-/*
- * This file contains global data and the size of the global data can NOT
- * change or otherwise it would make the shared library incompatable. It
- * is padded so that new data can take the place of storage occupied by part
- * of it.
- */
-int msg_send_timeout = 100; /* milliseconds */
-int msg_receive_timeout = 10; /* milliseconds */
-int mutex_spin_limit = 0;
-int cthread_stack_mask = 0;
-extern void cthread_init();
-unsigned int cproc_default_stack_size = 1000000;
-int condition_spin_limit = 0;
-int condition_yield_limit = 7;
-unsigned int initial_stack_boundary = 0;
-unsigned int cthread_stack_base = 0; /* Base for stack allocation */
-int malloc_lock = 0; /*
- * Needs to be shared between malloc.o
- * and malloc_utils.o
- */
-
-/* global data padding, must NOT be static */
-char _threads_data_padding[208] = { 0 };
else
ttygid = -1;
- for (cp1 = "pqrs"; *cp1; cp1++) {
+ for (cp1 = "pqrstuvwxy"; *cp1; cp1++) {
line[8] = *cp1;
for (cp2 = "0123456789abcdef"; *cp2; cp2++) {
line[5] = 'p';