X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/d7e50217d7adf6e52786a38bcaa4cd698cb9a79e..4a3eedf9ecc9bbe3f3a5c6ce5e53ad199d639d32:/osfmk/ppc/commpage/bigcopy_970.s?ds=inline diff --git a/osfmk/ppc/commpage/bigcopy_970.s b/osfmk/ppc/commpage/bigcopy_970.s index fa9e1245a..add093ea3 100644 --- a/osfmk/ppc/commpage/bigcopy_970.s +++ b/osfmk/ppc/commpage/bigcopy_970.s @@ -1,16 +1,19 @@ /* * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ - * - * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER @@ -20,14 +23,15 @@ * Please see the License for the specific language governing rights and * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* ==================================== * Very Long Operand BCOPY for Mac OS X * ==================================== * - * Version of 6/11/2003, tuned for the IBM 970. This is for operands at - * least several pages long. It is called from bcopy()/memcpy()/memmove(). + * Version of 2/21/2004, tuned for the IBM 970. This is for operands at + * least several pages long. It is called from bcopy()/memcpy()/memmove(), + * and runs both in 32 and 64-bit mode. * * We use the following additional strategies not used by the shorter * operand paths. Mostly, we try to optimize for memory bandwidth: @@ -39,58 +43,40 @@ * which is amortized across the very long operand. * 2. Copy larger chunks per iteration to minimize R/W bus turnaround * and maximize DRAM page locality (opening a new page is expensive.) + * We use 256-byte chunks. * 3. Touch in one source chunk ahead with DCBT. This is probably the * least important change, and probably only helps restart the * hardware stream at the start of each source page. - * - * Register usage. Note the rather delicate way we assign multiple uses - * to the same register. Beware. 
- * r0 = temp (NB: cannot use r0 for any constant such as "c16") - * r3 = not used, as memcpy and memmove return 1st parameter as a value - * r4 = source ptr ("rs") - * r5 = count of bytes to move ("rc") - * r6 = constant 16 ("c16") - * r7 = constant 32 (""c32") - * r8 = constant 48 (""c48") - * r9 = constant 128 (""c128") - * r10 = vrsave ("rv") - * r11 = constant 256 (""c256") - * r12 = destination ptr ("rd") - * r13 = constant 384 (""c384") - * r14 = temp ("rx") - * r15 = temp ("rt") */ -#define rs r4 -#define rd r12 -#define rc r5 -#define rv r10 -#define rx r14 -#define rt r15 - -#define c16 r6 -#define c32 r7 -#define c48 r8 -#define c128 r9 -#define c256 r11 -#define c384 r13 + +#define rs r13 +#define rd r14 +#define rc r15 +#define rx r16 + +#define c16 r3 +#define c32 r4 +#define c48 r5 +#define c64 r6 +#define c80 r7 +#define c96 r8 +#define c112 r9 +#define c256 r10 +#define c384 r11 +#define rv r12 // vrsave // Offsets within the "red zone" (which is 224 bytes long): -#define rzR13 -8 -#define rzR14 -12 -#define rzR15 -16 -#define rzV20 -32 -#define rzV21 -48 -#define rzV22 -64 -#define rzV23 -80 -#define rzV24 -96 -#define rzV25 -112 -#define rzV26 -128 -#define rzV27 -144 -#define rzV28 -160 -#define rzV29 -176 -#define rzV30 -192 -#define rzV31 -208 +#define rzR3 -8 +#define rzR13 -16 +#define rzR14 -24 +#define rzR15 -32 +#define rzR16 -40 + +#define rzV20 -64 +#define rzV21 -80 +#define rzV22 -96 +#define rzV23 -112 #include @@ -99,401 +85,247 @@ #include .text - .globl EXT(bigcopy_970) - +/* + * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary + * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following + * simple transformations: + * - all word compares are changed to doubleword + * - all "srwi[.]" opcodes are changed to "srdi[.]" + * Nothing else is done. For this to work, the following rules must be + * carefully followed: + * - do not use carry or overflow + * - only use record mode if you are sure the results are mode-invariant + * for example, all "andi." and almost all "rlwinm." are fine + * - do not use "slwi", "slw", or "srw" + * An imaginative programmer could break the porting model in other ways, but the above + * are the most likely problem areas. It is perhaps surprising how well in practice + * this simple method works. + */ // Entry point. This is a subroutine of bcopy(). When called: -// r4 = source ptr (aka "rs") -// r12 = dest ptr (aka "rd") -// r5 = length (>= 16K bytes) (aka "rc") +// r0 = return address (also stored in caller's SF) +// r4 = source ptr +// r5 = length (at least several pages) +// r12 = dest ptr // -// We only do "forward" moves, ie non-overlapping or toward 0. -// -// We return with non-volatiles and r3 preserved. +// We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles +// and r3 preserved. .align 5 bigcopy_970: - stw r13,rzR13(r1) // spill non-volatile regs we use to redzone - stw r14,rzR14(r1) - stw r15,rzR15(r1) - li r0,rzV20 - neg rt,rd // start to cache-line-align destination - stvx v20,r1,r0 // we use all 32 VRs - li r0,rzV21 - stvx v21,r1,r0 - li r0,rzV22 - stvx v22,r1,r0 - li r0,rzV23 - stvx v23,r1,r0 - li r0,rzV24 - andi. 
rt,rt,127 // get #bytes to 128-byte align - stvx v24,r1,r0 - li r0,rzV25 - stvx v25,r1,r0 - li r0,rzV26 - sub rc,rc,rt // adjust length by #bytes to align destination - stvx v26,r1,r0 - li r0,rzV27 - stvx v27,r1,r0 - li r0,rzV28 - mtctr rt // #bytes to align destination - stvx v28,r1,r0 - li r0,rzV29 - stvx v29,r1,r0 - li r0,rzV30 - stvx v30,r1,r0 - li r0,rzV31 - stvx v31,r1,r0 - beq 2f // dest already 128-byte aligned - b 1f - + neg r2,r12 // is destination cache-line-aligned? + std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy() + std r13,rzR13(r1) // spill non-volatile regs we use to redzone + std r14,rzR14(r1) + std r15,rzR15(r1) + andi. r2,r2,0x7F // #bytes to align + std r16,rzR16(r1) + mr rs,r4 // copy parameters into nonvolatile registers + mr rd,r12 + mr rc,r5 + mr rx,r0 // also save return address + beq 1f // skip if already aligned // Cache-line-align destination. - - .align 5 -1: - lbz r0,0(rs) - addi rs,rs,1 - stb r0,0(rd) - addi rd,rd,1 - bdnz 1b + + mr r3,rd // set up dest ptr for memcpy() + mr r5,r2 // number of bytes to copy + add rs,rs,r2 // then bump our parameters past initial copy + add rd,rd,r2 + sub rc,rc,r2 + bla _COMM_PAGE_MEMCPY // 128-byte-align destination -// Is source 16-byte aligned? Load constant offsets. +// Load constant offsets and check whether source is 16-byte aligned. +// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage, +// and we dcbz only if cr7 beq is set. -2: +1: + dcbt 0,rs // touch in 1st line of source andi. r0,rs,15 // check source alignment mfspr rv,vrsave // save caller's bitmask - li r0,-1 // we use all 32 VRs li c16,16 // load the constant offsets for x-form ops li c32,32 + srwi r2,rc,8 // get number of 256-byte chunks to xfer + li r0,-256 // we use 24 VRs (ie, 0-23) li c48,48 - li c128,128 + li c64,64 + li c80,80 + or r0,r0,rv // add our bits to caller's + li c96,96 + mtctr r2 // set up loop count + li c112,112 + cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128 + mtspr vrsave,r0 // say we use vr0..vr23 li c256,256 li c384,384 - mtspr vrsave,r0 - -// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage, -// and we dcbz only if cr7 beq is set. We check to be sure the dcbz's -// won't zero source bytes before we load them, since we zero before -// loading as this is faster than zeroing after loading and before storing. + beq LalignedLoop // handle aligned sources - cmpw cr7,r0,r0 // initialize cr7 beq to use dcbz128 - sub rt,rs,rd // get (rs-rd) - cmplwi cr1,rt,512 // are we moving down less than 512 bytes? -// Start fetching in source cache lines. +// Set up for unaligned loop. - dcbt c128,rs // first line already touched in - dcbt c256,rs - dcbt c384,rs - - bge++ cr1,3f // skip if not moving down less than 512 bytes - cmpw cr7,c16,c32 // cannot dcbz since it would zero source bytes -3: - beq LalignedLoop // handle aligned sources lvsl v0,0,rs // get permute vector for left shift lvxl v1,0,rs // prime the loop + li r0,rzV20 // save non-volatile VRs in redzone + stvx v20,r1,r0 + li r0,rzV21 + stvx v21,r1,r0 + li r0,rzV22 + stvx v22,r1,r0 + li r0,rzV23 + stvx v23,r1,r0 b LunalignedLoop // enter unaligned loop -// Main loop for unaligned operands. We loop over 384-byte chunks (3 cache lines) -// since we need a few VRs for permuted destination QWs and the permute vector. +// Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines). +// Destination is 128-byte aligned, source is unaligned. 
.align 5 LunalignedLoop: - subi rc,rc,384 // decrement byte count - addi rx,rs,384 // get address of next chunk + dcbt c256,rs // touch in next chunk + dcbt c384,rs + addi r2,rs,128 // point to 2nd 128 bytes of source lvxl v2,c16,rs lvxl v3,c32,rs + lvxl v4,c48,rs + lvxl v5,c64,rs + lvxl v6,c80,rs + lvxl v7,c96,rs + lvxl v8,c112,rs + lvxl v9,0,r2 + addi rs,rs,256 // point to next source chunk + lvxl v10,c16,r2 + lvxl v11,c32,r2 + vperm v17,v1,v2,v0 + lvxl v12,c48,r2 + lvxl v13,c64,r2 + vperm v18,v2,v3,v0 + lvxl v14,c80,r2 + lvxl v15,c96,r2 + vperm v19,v3,v4,v0 + lvxl v16,c112,r2 + lvxl v1,0,rs // peek ahead at first source quad in next chunk + vperm v20,v4,v5,v0 + addi r2,rd,128 // point to 2nd 128 bytes of dest bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel - dcbz128 0,rd // (also skip if moving down less than 512 bytes) - bne-- cr7,1f // catch it first time through - dcbz128 c128,rd - dcbz128 c256,rd + dcbz128 0,rd + dcbz128 0,r2 1: - addi rt,rs,64 - dcbt 0,rx // touch in next chunk - dcbt c128,rx - dcbt c256,rx - lvxl v4,c48,rs - addi rs,rs,128 - lvxl v5,0,rt - cmplwi rc,384 // another chunk to go? - lvxl v6,c16,rt - lvxl v7,c32,rt - lvxl v8,c48,rt - addi rt,rs,64 - vperm v25,v1,v2,v0 - lvxl v9,0,rs - lvxl v10,c16,rs - vperm v26,v2,v3,v0 - lvxl v11,c32,rs - lvxl v12,c48,rs - vperm v27,v3,v4,v0 - addi rs,rs,128 - lvxl v13,0,rt - lvxl v14,c16,rt - vperm v28,v4,v5,v0 - lvxl v15,c32,rt - lvxl v16,c48,rt - vperm v29,v5,v6,v0 - addi rt,rs,64 - lvxl v17,0,rs - lvxl v18,c16,rs - vperm v30,v6,v7,v0 - lvxl v19,c32,rs - lvxl v20,c48,rs - vperm v31,v7,v8,v0 - addi rs,rs,128 - lvxl v21,0,rt - lvxl v22,c16,rt - vperm v2,v8,v9,v0 - lvxl v23,c32,rt - lvxl v24,c48,rt - vperm v3,v9,v10,v0 - lvx v1,0,rs // get 1st qw of next chunk - vperm v4,v10,v11,v0 - - addi rt,rd,64 - stvxl v25,0,rd - stvxl v26,c16,rd - vperm v5,v11,v12,v0 - stvxl v27,c32,rd - stvxl v28,c48,rd - vperm v6,v12,v13,v0 - addi rd,rd,128 - stvxl v29,0,rt - stvxl v30,c16,rt - vperm v7,v13,v14,v0 - stvxl v31,c32,rt - stvxl v2,c48,rt - vperm v8,v14,v15,v0 - addi rt,rd,64 - stvxl v3,0,rd - stvxl v4,c16,rd - vperm v9,v15,v16,v0 - stvxl v5,c32,rd - stvxl v6,c48,rd - vperm v10,v16,v17,v0 - addi rd,rd,128 - stvxl v7,0,rt - vperm v11,v17,v18,v0 - stvxl v8,c16,rt - stvxl v9,c32,rt - vperm v12,v18,v19,v0 - stvxl v10,c48,rt - addi rt,rd,64 - vperm v13,v19,v20,v0 - stvxl v11,0,rd - stvxl v12,c16,rd - vperm v14,v20,v21,v0 - stvxl v13,c32,rd - vperm v15,v21,v22,v0 - stvxl v14,c48,rd - vperm v16,v22,v23,v0 - addi rd,rd,128 - stvxl v15,0,rt - vperm v17,v23,v24,v0 - stvxl v16,c16,rt - vperm v18,v24,v1,v0 - stvxl v17,c32,rt - stvxl v18,c48,rt - bge++ LunalignedLoop // loop if another 384 bytes to go - -// End of unaligned main loop. Handle up to 384 leftover bytes. - - srwi. 
r0,rc,5 // get count of 32-byte chunks remaining - beq Ldone // none - rlwinm rc,rc,0,0x1F // mask count down to 0..31 leftover bytes - mtctr r0 -1: // loop over 32-byte chunks - lvx v2,c16,rs - lvx v3,c32,rs - addi rs,rs,32 - vperm v8,v1,v2,v0 - vperm v9,v2,v3,v0 - vor v1,v3,v3 // v1 <- v3 - stvx v8,0,rd - stvx v9,c16,rd - addi rd,rd,32 - bdnz 1b - - b Ldone + vperm v21,v5,v6,v0 + stvxl v17,0,rd + vperm v22,v6,v7,v0 + stvxl v18,c16,rd + vperm v23,v7,v8,v0 + stvxl v19,c32,rd + vperm v17,v8,v9,v0 + stvxl v20,c48,rd + vperm v18,v9,v10,v0 + stvxl v21,c64,rd + vperm v19,v10,v11,v0 + stvxl v22,c80,rd + vperm v20,v11,v12,v0 + stvxl v23,c96,rd + vperm v21,v12,v13,v0 + stvxl v17,c112,rd + vperm v22,v13,v14,v0 + addi rd,rd,256 // point to next dest chunk + stvxl v18,0,r2 + vperm v23,v14,v15,v0 + stvxl v19,c16,r2 + vperm v17,v15,v16,v0 + stvxl v20,c32,r2 + vperm v18,v16,v1,v0 + stvxl v21,c48,r2 + stvxl v22,c64,r2 + stvxl v23,c80,r2 + stvxl v17,c96,r2 + stvxl v18,c112,r2 + bdnz++ LunalignedLoop // loop if another 256 bytes to go + + li r6,rzV20 // restore non-volatile VRs + li r7,rzV21 + li r8,rzV22 + li r9,rzV23 + lvx v20,r1,r6 + lvx v21,r1,r7 + lvx v22,r1,r8 + lvx v23,r1,r9 + b Ldone // Aligned loop. Destination is 128-byte aligned, and source is 16-byte -// aligned. Loop over 512-byte chunks (4 cache lines.) +// aligned. Loop over 256-byte chunks (2 cache lines.) .align 5 LalignedLoop: - subi rc,rc,512 // decrement count - addi rx,rs,512 // address of next chunk + dcbt c256,rs // touch in next chunk + dcbt c384,rs + addi r2,rs,128 // point to 2nd 128 bytes of source lvxl v1,0,rs lvxl v2,c16,rs - bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel - dcbz128 0,rd // (also skip if moving down less than 512 bytes) - bne-- cr7,1f // catch it first time through - dcbz128 c128,rd - dcbz128 c256,rd - dcbz128 c384,rd -1: - addi rt,rs,64 - dcbt 0,rx // touch in next chunk - dcbt c128,rx - dcbt c256,rx - dcbt c384,rx lvxl v3,c32,rs lvxl v4,c48,rs - addi rs,rs,128 - lvxl v5,0,rt - cmplwi rc,512 // another chunk to go? 
- lvxl v6,c16,rt - lvxl v7,c32,rt - lvxl v8,c48,rt - addi rt,rs,64 - lvxl v9,0,rs - lvxl v10,c16,rs - lvxl v11,c32,rs - lvxl v12,c48,rs - addi rs,rs,128 - lvxl v13,0,rt - lvxl v14,c16,rt - lvxl v15,c32,rt - lvxl v16,c48,rt - addi rt,rs,64 - lvxl v17,0,rs - lvxl v18,c16,rs - lvxl v19,c32,rs - lvxl v20,c48,rs - addi rs,rs,128 - lvxl v21,0,rt - lvxl v22,c16,rt - lvxl v23,c32,rt - lvxl v24,c48,rt - addi rt,rs,64 - lvxl v25,0,rs - lvxl v26,c16,rs - lvxl v27,c32,rs - lvxl v28,c48,rs - addi rs,rs,128 - lvxl v29,0,rt - lvxl v30,c16,rt - lvxl v31,c32,rt - lvxl v0,c48,rt - - addi rt,rd,64 + lvxl v5,c64,rs + lvxl v6,c80,rs + lvxl v7,c96,rs + lvxl v8,c112,rs + lvxl v9,0,r2 + lvxl v10,c16,r2 + lvxl v11,c32,r2 + lvxl v12,c48,r2 + lvxl v13,c64,r2 + lvxl v14,c80,r2 + lvxl v15,c96,r2 + lvxl v16,c112,r2 + addi r2,rd,128 // point to 2nd 128 bytes of dest + bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel + dcbz128 0,rd + dcbz128 0,r2 +1: + addi rs,rs,256 // point to next source chunk stvxl v1,0,rd stvxl v2,c16,rd stvxl v3,c32,rd stvxl v4,c48,rd - addi rd,rd,128 - stvxl v5,0,rt - stvxl v6,c16,rt - stvxl v7,c32,rt - stvxl v8,c48,rt - addi rt,rd,64 - stvxl v9,0,rd - stvxl v10,c16,rd - stvxl v11,c32,rd - stvxl v12,c48,rd - addi rd,rd,128 - stvxl v13,0,rt - stvxl v14,c16,rt - stvxl v15,c32,rt - stvxl v16,c48,rt - addi rt,rd,64 - stvxl v17,0,rd - stvxl v18,c16,rd - stvxl v19,c32,rd - stvxl v20,c48,rd - addi rd,rd,128 - stvxl v21,0,rt - stvxl v22,c16,rt - stvxl v23,c32,rt - stvxl v24,c48,rt - addi rt,rd,64 - stvxl v25,0,rd - stvxl v26,c16,rd - stvxl v27,c32,rd - stvxl v28,c48,rd - addi rd,rd,128 - stvxl v29,0,rt - stvxl v30,c16,rt - stvxl v31,c32,rt - stvxl v0,c48,rt - bge++ LalignedLoop // loop if another 512 bytes to go - -// End of aligned main loop. Handle up to 511 leftover bytes. - - srwi. r0,rc,5 // get count of 32-byte chunks remaining - beq Ldone // none - rlwinm rc,rc,0,0x1F // mask count down to 0..31 leftover bytes - mtctr r0 -1: // loop over 32-byte chunks - lvx v1,0,rs - lvx v2,c16,rs - addi rs,rs,32 - stvx v1,0,rd - stvx v2,c16,rd - addi rd,rd,32 - bdnz 1b - - -// Done, except for 0..31 leftovers at end. Restore non-volatiles. + stvxl v5,c64,rd + stvxl v6,c80,rd + stvxl v7,c96,rd + stvxl v8,c112,rd + addi rd,rd,256 // point to next dest chunk + stvxl v9,0,r2 + stvxl v10,c16,r2 + stvxl v11,c32,r2 + stvxl v12,c48,r2 + stvxl v13,c64,r2 + stvxl v14,c80,r2 + stvxl v15,c96,r2 + stvxl v16,c112,r2 + bdnz++ LalignedLoop // loop if another 256 bytes to go + + +// Done, except for 0..255 leftover bytes at end. // rs = source ptr // rd = dest ptr -// rc = count (0..31) +// rc = remaining count in low 7 bits // rv = caller's vrsave +// rx = caller's return address Ldone: - cmpwi rc,0 // any leftover bytes? - lwz r13,rzR13(r1) // restore non-volatiles from redzone - lwz r14,rzR14(r1) - lwz r15,rzR15(r1) - li r0,rzV20 - lvx v20,r1,r0 - li r0,rzV21 - lvx v21,r1,r0 - li r0,rzV22 - lvx v22,r1,r0 - li r0,rzV23 - lvx v23,r1,r0 - li r0,rzV24 - lvx v24,r1,r0 - li r0,rzV25 - lvx v25,r1,r0 - li r0,rzV26 - lvx v26,r1,r0 - li r0,rzV27 - lvx v27,r1,r0 - li r0,rzV28 - lvx v28,r1,r0 - li r0,rzV29 - lvx v29,r1,r0 - li r0,rzV30 - lvx v30,r1,r0 - li r0,rzV31 - lvx v31,r1,r0 - mtspr vrsave,rv // restore caller's bitmask - beqlr // done if no leftover bytes - - -// Handle 1..31 leftover bytes at end. - - mtctr rc // set up loop count - b 1f - - .align 5 -1: - lbz r0,0(rs) - addi rs,rs,1 - stb r0,0(rd) - addi rd,rd,1 - bdnz 1b + andi. r5,rc,0xFF // any leftover bytes? 
(0..255) + mtspr vrsave,rv // restore bitmap of live vr's + mr r3,rd + mr r4,rs + bnela _COMM_PAGE_MEMCPY // copy leftover bytes + + mtlr rx // restore return address + ld r3,rzR3(r1) // restore non-volatile GPRs from redzone + ld r13,rzR13(r1) + ld r14,rzR14(r1) + ld r15,rzR15(r1) + ld r16,rzR16(r1) blr - COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,0) // load on all machines for now + COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)
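
The strategy comments in the new version of this file (128-byte cache-line alignment of the destination, 256-byte chunks per iteration, dcbt touch-ahead on the next source chunk, and 0..255 leftover bytes handed back to the short-operand memcpy) can be summarized in a portable C sketch. The sketch below is hypothetical and for illustration only; it is not the commpage code. The names bigcopy_sketch, CHUNK, and the use of __builtin_prefetch are assumptions, it omits the dcbz128 destination zeroing and all of the vector-register work the real routine relies on, and it assumes len is at least several pages long, as the entry conditions require.

#include <string.h>
#include <stdint.h>
#include <stddef.h>

#define CACHE_LINE  128     /* destination alignment target */
#define CHUNK       256     /* two cache lines per loop iteration */

static void bigcopy_sketch(void *dst, const void *src, size_t len)
{
    uint8_t *d = dst;
    const uint8_t *s = src;

    /* 1. Cache-line-align the destination with a short initial copy
     *    (the assembly calls _COMM_PAGE_MEMCPY for this). */
    size_t head = (CACHE_LINE - ((uintptr_t)d & (CACHE_LINE - 1))) & (CACHE_LINE - 1);
    memcpy(d, s, head);
    d += head; s += head; len -= head;

    /* 2. Main loop: one 256-byte chunk per iteration, touching the next
     *    source chunk ahead.  The real code also zeroes the destination
     *    lines with dcbz128 so the store stream never reads them from DRAM. */
    while (len >= CHUNK) {
#ifdef __GNUC__
        __builtin_prefetch(s + CHUNK);               /* analogue of "dcbt c256,rs" */
        __builtin_prefetch(s + CHUNK + CACHE_LINE);  /* analogue of "dcbt c384,rs" */
#endif
        memcpy(d, s, CHUNK);    /* stands in for the 16-quadword vector loop body */
        d += CHUNK; s += CHUNK; len -= CHUNK;
    }

    /* 3. The 0..255 leftover bytes go back to the ordinary short-operand copy. */
    memcpy(d, s, len);
}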
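The lvsl/vperm technique in LunalignedLoop (load only aligned quadwords from the misaligned source, then permute each adjacent pair by the misalignment so that every load and store stays aligned) also has a simple scalar analogue. The sketch below is hypothetical and purely illustrative: it uses 64-bit words and shifts in place of 16-byte vectors and vperm, assumes a little-endian host with dst and the word count already prepared by the caller, and, like the vector loop's peek-ahead load of v1, it may read a few bytes past the end of the source data.

#include <stdint.h>
#include <stddef.h>

static void merge_copy_sketch(uint64_t *dst, const uint8_t *src, size_t nwords)
{
    size_t off = (uintptr_t)src & 7;                    /* misalignment, like lvsl */
    const uint64_t *as = (const uint64_t *)(src - off); /* aligned source base */

    if (off == 0) {                     /* already aligned: plain word copy */
        for (size_t i = 0; i < nwords; i++)
            dst[i] = as[i];
        return;
    }

    uint64_t prev = as[0];              /* "prime the loop" (lvxl v1,0,rs) */
    for (size_t i = 0; i < nwords; i++) {
        uint64_t next = as[i + 1];      /* next aligned word (may peek past the end) */
        /* Splice the two aligned words by the misalignment: the vperm step. */
        dst[i] = (prev >> (8 * off)) | (next << (8 * (8 - off)));
        prev = next;
    }
}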