[apple/xnu.git] / osfmk / ppc / commpage / bigcopy_970.s

/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code 
 * as defined in and that are subject to the Apple Public Source License 
 * Version 2.0 (the 'License'). You may not use this file except in 
 * compliance with the License.  The rights granted to you under the 
 * License may not be used to create, or enable the creation or 
 * redistribution of, unlawful or unlicensed copies of an Apple operating 
 * system, or to circumvent, violate, or enable the circumvention or 
 * violation of, any terms of an Apple operating system software license 
 * agreement.
 *
 * Please obtain a copy of the License at 
 * http://www.opensource.apple.com/apsl/ and read it before using this 
 * file.
 *
 * The Original Code and all software distributed under the License are 
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
 * Please see the License for the specific language governing rights and 
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 2/21/2004, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove(),
 * and runs both in 32 and 64-bit mode.
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth:
 *	1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *	2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive.)
 *     We use 256-byte chunks.
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 */
 
// Nonvolatile GPRs that hold the copy state for the whole routine (they
// must survive the calls to memcpy()):
//   rs = running source ptr     rd = running dest ptr
//   rc = byte count             rx = caller's return address
#define rs	r13
#define rd	r14
#define rc	r15
#define rx  r16

// Volatile GPRs preloaded with constant byte offsets.  c16..c112 are the
// index operands of the x-form vector loads/stores; c256 and c384 are the
// dcbt offsets used to touch in the next 256-byte source chunk.
#define c16     r3
#define c32     r4
#define c48     r5
#define c64     r6
#define c80     r7
#define c96     r8
#define c112    r9
#define	c256	r10
#define	c384	r11
#define rv      r12     // caller's vrsave value, restored at Ldone

// Offsets within the "red zone" (which is 224 bytes long):
// All offsets are negative displacements from r1.  GPR slots are 8 bytes
// apart (saved with std, reloaded with ld).

#define rzR3    -8
#define rzR13	-16
#define rzR14	-24
#define rzR15   -32
#define rzR16   -40

// Vector register slots are 16 bytes apart (saved with stvx, reloaded with
// lvx).  Only the unaligned-source path uses v20..v23 and hence these slots.

#define rzV20	-64
#define rzV21	-80
#define rzV22	-96
#define rzV23	-112


#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"                      
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */

// Entry point.  This is a subroutine of bcopy().  When called:
//  r0 = return address (also stored in caller's SF)
//	r4 = source ptr
//	r5 = length (at least several pages)
// r12 = dest ptr
// 
// We only do "forward" moves, ie non-overlapping or toward 0.  We return with non-volatiles
// and r3 preserved.
//
// This first section spills the registers we need to the red zone, moves
// the parameters into nonvolatile registers, and, if the destination is not
// already 128-byte aligned, calls memcpy() to copy the 1..127 bytes needed
// to align it.  Execution then continues at local label 1.

        .align 	5
bigcopy_970:
        neg     r2,r12              // is destination cache-line-aligned?
        std     r3,rzR3(r1)         // save caller's r3, which must be preserved for memcpy()
        std		r13,rzR13(r1)		// spill non-volatile regs we use to redzone
        std		r14,rzR14(r1)
        std		r15,rzR15(r1)
        andi.   r2,r2,0x7F          // #bytes to align = (-dest) mod 128; sets cr0 for the beq below
        std     r16,rzR16(r1)
        mr      rs,r4               // copy parameters into nonvolatile registers
        mr      rd,r12
        mr      rc,r5
        mr      rx,r0               // also save return address
        beq     1f                  // skip if already aligned

// Cache-line-align destination.
// memcpy() is called with r3 = dest, r4 = source (still the value passed in
// on entry), r5 = byte count.  rs/rd/rc are advanced before the call so
// they already describe the remainder when memcpy() returns.
        
        mr      r3,rd               // set up dest ptr for memcpy() (taken before rd is bumped)
        mr      r5,r2               // number of bytes to copy
        add     rs,rs,r2            // then bump our parameters past initial copy
        add     rd,rd,r2
        sub     rc,rc,r2
        bla     _COMM_PAGE_MEMCPY   // 128-byte-align destination (overwrites LR; return address is kept in rx)


// Load constant offsets and check whether source is 16-byte aligned.
// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only if cr7 beq is set.
//
// On arrival here rd is 128-byte aligned and rc is the byte count that
// remains.  The loop count placed in CTR is rc/256; the leftover 0..255
// bytes are handled at Ldone.  The andi. below sets cr0, and nothing
// between it and the final beq writes cr0 (the cmpd targets cr7), so the
// beq tests the source alignment.

1:
        dcbt    0,rs                // touch in 1st line of source
        andi.	r0,rs,15			// check source alignment (cr0 eq => 16-byte aligned)
        mfspr	rv,vrsave			// save caller's bitmask
        li		c16,16				// load the constant offsets for x-form ops
        li		c32,32
        srwi    r2,rc,8             // get number of 256-byte chunks to xfer
        li		r0,-256				// we use 24 VRs (ie, 0-23): mask 0xFFFFFF00
        li		c48,48
        li      c64,64
        li      c80,80
        or      r0,r0,rv            // add our bits to caller's
        li		c96,96
        mtctr   r2                  // set up loop count
        li      c112,112
        cmpd    cr7,r2,r2           // initialize cr7_eq to "on", so we dcbz128
        mtspr	vrsave,r0           // say we use vr0..vr23
        li		c256,256
        li		c384,384
        beq		LalignedLoop		// handle aligned sources

        
// Set up for unaligned loop.
// v0 receives the permute control vector derived from the low 4 bits of rs,
// and v1 is primed with the first source quadword so the loop can combine
// it with the next one.  v20..v23 are nonvolatile and are used as permute
// outputs by the unaligned loop, so they are saved in the red zone here.
// r0 is free to use as scratch because vrsave has already been written.

        lvsl	v0,0,rs				// get permute vector for left shift
        lvxl	v1,0,rs				// prime the loop
        li		r0,rzV20            // save non-volatile VRs in redzone
        stvx	v20,r1,r0
        li		r0,rzV21
        stvx	v21,r1,r0
        li		r0,rzV22
        stvx	v22,r1,r0
        li		r0,rzV23
        stvx	v23,r1,r0
        b		LunalignedLoop		// enter unaligned loop


// Main loop for unaligned operands.  We loop over 256-byte chunks (2 cache lines).
// Destination is 128-byte aligned, source is unaligned.
//
// Register usage inside the loop:
//   v0        permute control vector from lvsl (loop invariant)
//   v1        first source quadword of the current chunk; it is loaded one
//             iteration ahead (primed before the loop, refreshed by the
//             "peek ahead" load below)
//   v2..v16   the remaining 15 source quadwords of the current chunk
//   v17..v23  permuted output quadwords, each built from two adjacent
//             source quadwords and reused several times per iteration
//   r2        first rs+128, later rd+128 (second cache line of the chunk)
// The vperm's are interleaved with the loads and stores; the instruction
// order is deliberate, so do not reorder it casually.

        .align	5
LunalignedLoop:
        dcbt	c256,rs             // touch in next chunk
        dcbt	c384,rs
        addi    r2,rs,128           // point to 2nd 128 bytes of source
        lvxl	v2,c16,rs
        lvxl	v3,c32,rs
        lvxl	v4,c48,rs
        lvxl    v5,c64,rs
        lvxl    v6,c80,rs
        lvxl    v7,c96,rs
        lvxl    v8,c112,rs
        lvxl    v9,0,r2
        addi    rs,rs,256           // point to next source chunk (loads below still use r2 = old rs+128)
        lvxl    v10,c16,r2
        lvxl    v11,c32,r2
        vperm   v17,v1,v2,v0
        lvxl    v12,c48,r2
        lvxl    v13,c64,r2
        vperm   v18,v2,v3,v0
        lvxl    v14,c80,r2
        lvxl    v15,c96,r2
        vperm   v19,v3,v4,v0
        lvxl    v16,c112,r2
        lvxl	v1,0,rs             // peek ahead at first source quad in next chunk
        vperm   v20,v4,v5,v0
        addi    r2,rd,128           // point to 2nd 128 bytes of dest 
        bne--	cr7,1f				// skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128	0,rd                // zero both dest lines so they need not be read from memory
        dcbz128	0,r2
1:
        vperm   v21,v5,v6,v0
        stvxl	v17,0,rd
        vperm   v22,v6,v7,v0
        stvxl	v18,c16,rd
        vperm   v23,v7,v8,v0
        stvxl	v19,c32,rd
        vperm   v17,v8,v9,v0
        stvxl	v20,c48,rd
        vperm   v18,v9,v10,v0
        stvxl	v21,c64,rd
        vperm   v19,v10,v11,v0
        stvxl	v22,c80,rd
        vperm   v20,v11,v12,v0
        stvxl	v23,c96,rd
        vperm   v21,v12,v13,v0
        stvxl	v17,c112,rd
        vperm   v22,v13,v14,v0
        addi	rd,rd,256           // point to next dest chunk (stores below use r2 = old rd+128)
        stvxl	v18,0,r2
        vperm   v23,v14,v15,v0
        stvxl	v19,c16,r2
        vperm   v17,v15,v16,v0
        stvxl	v20,c32,r2
        vperm   v18,v16,v1,v0       // last output quad uses the peeked-ahead v1
        stvxl	v21,c48,r2
        stvxl	v22,c64,r2
        stvxl	v23,c80,r2
        stvxl	v17,c96,r2
        stvxl	v18,c112,r2
        bdnz++	LunalignedLoop      // loop if another 256 bytes to go

// Loop finished.  r6..r9 (c64..c112) are no longer needed as offsets, so
// they are reused as index registers to reload v20..v23 from the red zone.
        li		r6,rzV20            // restore non-volatile VRs
        li		r7,rzV21
        li		r8,rzV22
        li		r9,rzV23
        lvx		v20,r1,r6
        lvx		v21,r1,r7
        lvx		v22,r1,r8
        lvx		v23,r1,r9
        b       Ldone
        
        
// Aligned loop.  Destination is 128-byte aligned, and source is 16-byte
// aligned.  Loop over 256-byte chunks (2 cache lines.)
//
// Each iteration loads all 16 source quadwords into v1..v16, optionally
// zeroes the two destination lines with dcbz128, then stores v1..v16.
// No permute is needed, so only volatile VRs are used and nothing has to
// be saved in the red zone.  r2 is first rs+128, then rd+128.  The loop
// falls through into Ldone when CTR reaches zero.

        .align	5
LalignedLoop:
        dcbt	c256,rs             // touch in next chunk
        dcbt	c384,rs
        addi    r2,rs,128           // point to 2nd 128 bytes of source
        lvxl	v1,0,rs
        lvxl	v2,c16,rs
        lvxl	v3,c32,rs
        lvxl	v4,c48,rs
        lvxl    v5,c64,rs
        lvxl    v6,c80,rs
        lvxl    v7,c96,rs
        lvxl    v8,c112,rs
        lvxl    v9,0,r2
        lvxl    v10,c16,r2
        lvxl    v11,c32,r2
        lvxl    v12,c48,r2
        lvxl    v13,c64,r2
        lvxl    v14,c80,r2
        lvxl    v15,c96,r2
        lvxl    v16,c112,r2
        addi    r2,rd,128           // point to 2nd 128 bytes of dest 
        bne--	cr7,1f				// skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128	0,rd                // zero both dest lines so they need not be read from memory
        dcbz128	0,r2
1:
        addi    rs,rs,256           // point to next source chunk
        stvxl	v1,0,rd
        stvxl	v2,c16,rd
        stvxl	v3,c32,rd
        stvxl	v4,c48,rd
        stvxl	v5,c64,rd
        stvxl	v6,c80,rd
        stvxl	v7,c96,rd
        stvxl	v8,c112,rd
        addi	rd,rd,256           // point to next dest chunk (stores below use r2 = old rd+128)
        stvxl	v9,0,r2
        stvxl	v10,c16,r2
        stvxl	v11,c32,r2
        stvxl	v12,c48,r2
        stvxl	v13,c64,r2
        stvxl	v14,c80,r2
        stvxl	v15,c96,r2
        stvxl	v16,c112,r2
        bdnz++	LalignedLoop		// loop if another 256 bytes to go


// Done, except for 0..255 leftover bytes at end.
//	rs = source ptr
//	rd = dest ptr
//	rc = remaining count in low 8 bits (the loops never decrement rc)
//	rv = caller's vrsave
//  rx = caller's return address
//
// Any leftover bytes are copied by memcpy(r3 = dest, r4 = source,
// r5 = count).  The call is conditional on cr0 from the andi. below;
// the mtspr and mr instructions in between do not alter cr0.

Ldone:
        andi.   r5,rc,0xFF          // any leftover bytes? (0..255)
        mtspr	vrsave,rv			// restore bitmap of live vr's
        
        mr      r3,rd
        mr      r4,rs
        bnela   _COMM_PAGE_MEMCPY   // copy leftover bytes (skipped if count is 0)

        mtlr    rx                  // restore return address
        ld      r3,rzR3(r1)         // restore caller's r3 and non-volatile GPRs from redzone
        ld		r13,rzR13(r1)
        ld		r14,rzR14(r1)
        ld		r15,rzR15(r1)
        ld      r16,rzR16(r1)
        blr


        // Describe this routine to the commpage machinery: code starting at
        // bigcopy_970, to be placed at _COMM_PAGE_BIGCOPY.
        // NOTE(review): the macro is defined outside this file (presumably in
        // machine/commpage.h).  The two 0 arguments and the meaning of
        // kPort32to64+kCommPageBoth are assumed to be capability filters and
        // "port to 64-bit / install in both commpages" flags -- confirm there.
        COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)
Commit	Line	Data
55e303ae A	1	/*
	2	* Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
	3	*
8ad349bb	4	* @APPLE_LICENSE_OSREFERENCE_HEADER_START@
55e303ae	5	*
8ad349bb A	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the
	10	* License may not be used to create, or enable the creation or
	11	* redistribution of, unlawful or unlicensed copies of an Apple operating
	12	* system, or to circumvent, violate, or enable the circumvention or
	13	* violation of, any terms of an Apple operating system software license
	14	* agreement.
	15	*
	16	* Please obtain a copy of the License at
	17	* http://www.opensource.apple.com/apsl/ and read it before using this
	18	* file.
	19	*
	20	* The Original Code and all software distributed under the License are
	21	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	22	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	23	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	24	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	25	* Please see the License for the specific language governing rights and
	26	* limitations under the License.
	27	*
	28	* @APPLE_LICENSE_OSREFERENCE_HEADER_END@
55e303ae A	29	*/
	30	/* ====================================
	31	* Very Long Operand BCOPY for Mac OS X
	32	* ====================================
	33	*
91447636 A	34	* Version of 2/21/2004, tuned for the IBM 970. This is for operands at
	35	* least several pages long. It is called from bcopy()/memcpy()/memmove(),
	36	* and runs both in 32 and 64-bit mode.
55e303ae A	37	*
	38	* We use the following additional strategies not used by the shorter
	39	* operand paths. Mostly, we try to optimize for memory bandwidth:
	40	* 1. Use DCBZ128 to avoid reading destination lines. Because this code
	41	* resides on the commmpage, it can use a private interface with the
	42	* kernel to minimize alignment exceptions if the destination is
	43	* uncached. The kernel will clear cr7 whenever it emulates a DCBZ or
	44	* DCBZ128 on the commpage. Thus we take at most one exception per call,
	45	* which is amortized across the very long operand.
	46	* 2. Copy larger chunks per iteration to minimize R/W bus turnaround
	47	* and maximize DRAM page locality (opening a new page is expensive.)
91447636	48	* We use 256-byte chunks.
55e303ae A	49	* 3. Touch in one source chunk ahead with DCBT. This is probably the
	50	* least important change, and probably only helps restart the
	51	* hardware stream at the start of each source page.
55e303ae	52	*/
91447636 A	53
	54	#define rs r13
	55	#define rd r14
	56	#define rc r15
	57	#define rx r16
	58
	59	#define c16 r3
	60	#define c32 r4
	61	#define c48 r5
	62	#define c64 r6
	63	#define c80 r7
	64	#define c96 r8
	65	#define c112 r9
	66	#define c256 r10
	67	#define c384 r11
	68	#define rv r12 // vrsave
55e303ae A	69
	70	// Offsets within the "red zone" (which is 224 bytes long):
	71
91447636 A	72	#define rzR3 -8
	73	#define rzR13 -16
	74	#define rzR14 -24
	75	#define rzR15 -32
	76	#define rzR16 -40
	77
	78	#define rzV20 -64
	79	#define rzV21 -80
	80	#define rzV22 -96
	81	#define rzV23 -112
55e303ae A	82
	83
	84	#include <sys/appleapiopts.h>
	85	#include <ppc/asm.h>
	86	#include <machine/cpu_capabilities.h>
	87	#include <machine/commpage.h>
	88
	89	.text
91447636 A	90	/*
	91	* WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
	92	* to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
	93	* simple transformations:
	94	* - all word compares are changed to doubleword
	95	* - all "srwi[.]" opcodes are changed to "srdi[.]"
	96	* Nothing else is done. For this to work, the following rules must be
	97	* carefully followed:
	98	* - do not use carry or overflow
	99	* - only use record mode if you are sure the results are mode-invariant
	100	* for example, all "andi." and almost all "rlwinm." are fine
	101	* - do not use "slwi", "slw", or "srw"
	102	* An imaginative programmer could break the porting model in other ways, but the above
	103	* are the most likely problem areas. It is perhaps surprising how well in practice
	104	* this simple method works.
	105	*/
55e303ae A	106
55e303ae A	107	// Entry point. This is a subroutine of bcopy(). When called:
91447636 A	108	// r0 = return address (also stored in caller's SF)
	109	// r4 = source ptr
	110	// r5 = length (at least several pages)
	111	// r12 = dest ptr
55e303ae	112	//
91447636 A	113	// We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles
91447636 A	114	// and r3 preserved.
55e303ae A	115
	116	.align 5
	117	bigcopy_970:
91447636 A	118	neg r2,r12 // is destination cache-line-aligned?
	119	std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy()
	120	std r13,rzR13(r1) // spill non-volatile regs we use to redzone
	121	std r14,rzR14(r1)
	122	std r15,rzR15(r1)
	123	andi. r2,r2,0x7F // #bytes to align
	124	std r16,rzR16(r1)
	125	mr rs,r4 // copy parameters into nonvolatile registers
	126	mr rd,r12
	127	mr rc,r5
	128	mr rx,r0 // also save return address
	129	beq 1f // skip if already aligned
55e303ae A	130
55e303ae A	131	// Cache-line-align destination.
91447636 A	132
	133	mr r3,rd // set up dest ptr for memcpy()
	134	mr r5,r2 // number of bytes to copy
	135	add rs,rs,r2 // then bump our parameters past initial copy
	136	add rd,rd,r2
	137	sub rc,rc,r2
	138	bla _COMM_PAGE_MEMCPY // 128-byte-align destination
55e303ae A	139
55e303ae A	140
91447636 A	141	// Load constant offsets and check whether source is 16-byte aligned.
	142	// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
	143	// and we dcbz only if cr7 beq is set.
55e303ae	144
91447636 A	145	1:
91447636 A	146	dcbt 0,rs // touch in 1st line of source
55e303ae A	147	andi. r0,rs,15 // check source alignment
55e303ae A	148	mfspr rv,vrsave // save caller's bitmask
55e303ae A	149	li c16,16 // load the constant offsets for x-form ops
55e303ae A	150	li c32,32
91447636 A	151	srwi r2,rc,8 // get number of 256-byte chunks to xfer
91447636 A	152	li r0,-256 // we use 24 VRs (ie, 0-23)
55e303ae	153	li c48,48
91447636 A	154	li c64,64
	155	li c80,80
	156	or r0,r0,rv // add our bits to caller's
	157	li c96,96
	158	mtctr r2 // set up loop count
	159	li c112,112
	160	cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128
	161	mtspr vrsave,r0 // say we use vr0..vr23
55e303ae A	162	li c256,256
55e303ae A	163	li c384,384
91447636	164	beq LalignedLoop // handle aligned sources
55e303ae	165
55e303ae	166
91447636	167	// Set up for unaligned loop.
55e303ae	168
55e303ae A	169	lvsl v0,0,rs // get permute vector for left shift
55e303ae A	170	lvxl v1,0,rs // prime the loop
91447636 A	171	li r0,rzV20 // save non-volatile VRs in redzone
	172	stvx v20,r1,r0
	173	li r0,rzV21
	174	stvx v21,r1,r0
	175	li r0,rzV22
	176	stvx v22,r1,r0
	177	li r0,rzV23
	178	stvx v23,r1,r0
55e303ae A	179	b LunalignedLoop // enter unaligned loop
	180
	181
91447636 A	182	// Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines).
91447636 A	183	// Destination is 128-byte aligned, source is unaligned.
55e303ae A	184
	185	.align 5
	186	LunalignedLoop:
91447636 A	187	dcbt c256,rs // touch in next chunk
	188	dcbt c384,rs
	189	addi r2,rs,128 // point to 2nd 128 bytes of source
55e303ae A	190	lvxl v2,c16,rs
55e303ae A	191	lvxl v3,c32,rs
91447636 A	192	lvxl v4,c48,rs
	193	lvxl v5,c64,rs
	194	lvxl v6,c80,rs
	195	lvxl v7,c96,rs
	196	lvxl v8,c112,rs
	197	lvxl v9,0,r2
	198	addi rs,rs,256 // point to next source chunk
	199	lvxl v10,c16,r2
	200	lvxl v11,c32,r2
	201	vperm v17,v1,v2,v0
	202	lvxl v12,c48,r2
	203	lvxl v13,c64,r2
	204	vperm v18,v2,v3,v0
	205	lvxl v14,c80,r2
	206	lvxl v15,c96,r2
	207	vperm v19,v3,v4,v0
	208	lvxl v16,c112,r2
	209	lvxl v1,0,rs // peek ahead at first source quad in next chunk
	210	vperm v20,v4,v5,v0
	211	addi r2,rd,128 // point to 2nd 128 bytes of dest
55e303ae	212	bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
91447636 A	213	dcbz128 0,rd
91447636 A	214	dcbz128 0,r2
55e303ae	215	1:
91447636 A	216	vperm v21,v5,v6,v0
	217	stvxl v17,0,rd
	218	vperm v22,v6,v7,v0
	219	stvxl v18,c16,rd
	220	vperm v23,v7,v8,v0
	221	stvxl v19,c32,rd
	222	vperm v17,v8,v9,v0
	223	stvxl v20,c48,rd
	224	vperm v18,v9,v10,v0
	225	stvxl v21,c64,rd
	226	vperm v19,v10,v11,v0
	227	stvxl v22,c80,rd
	228	vperm v20,v11,v12,v0
	229	stvxl v23,c96,rd
	230	vperm v21,v12,v13,v0
	231	stvxl v17,c112,rd
	232	vperm v22,v13,v14,v0
	233	addi rd,rd,256 // point to next dest chunk
	234	stvxl v18,0,r2
	235	vperm v23,v14,v15,v0
	236	stvxl v19,c16,r2
	237	vperm v17,v15,v16,v0
	238	stvxl v20,c32,r2
	239	vperm v18,v16,v1,v0
	240	stvxl v21,c48,r2
	241	stvxl v22,c64,r2
	242	stvxl v23,c80,r2
	243	stvxl v17,c96,r2
	244	stvxl v18,c112,r2
	245	bdnz++ LunalignedLoop // loop if another 256 bytes to go
	246
	247	li r6,rzV20 // restore non-volatile VRs
	248	li r7,rzV21
	249	li r8,rzV22
	250	li r9,rzV23
	251	lvx v20,r1,r6
	252	lvx v21,r1,r7
	253	lvx v22,r1,r8
	254	lvx v23,r1,r9
	255	b Ldone
55e303ae A	256
	257
	258	// Aligned loop. Destination is 128-byte aligned, and source is 16-byte
91447636	259	// aligned. Loop over 256-byte chunks (2 cache lines.)
55e303ae A	260
	261	.align 5
	262	LalignedLoop:
91447636 A	263	dcbt c256,rs // touch in next chunk
	264	dcbt c384,rs
	265	addi r2,rs,128 // point to 2nd 128 bytes of source
55e303ae A	266	lvxl v1,0,rs
55e303ae A	267	lvxl v2,c16,rs
55e303ae A	268	lvxl v3,c32,rs
55e303ae A	269	lvxl v4,c48,rs
91447636 A	270	lvxl v5,c64,rs
	271	lvxl v6,c80,rs
	272	lvxl v7,c96,rs
	273	lvxl v8,c112,rs
	274	lvxl v9,0,r2
	275	lvxl v10,c16,r2
	276	lvxl v11,c32,r2
	277	lvxl v12,c48,r2
	278	lvxl v13,c64,r2
	279	lvxl v14,c80,r2
	280	lvxl v15,c96,r2
	281	lvxl v16,c112,r2
	282	addi r2,rd,128 // point to 2nd 128 bytes of dest
	283	bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
	284	dcbz128 0,rd
	285	dcbz128 0,r2
	286	1:
	287	addi rs,rs,256 // point to next source chunk
55e303ae A	288	stvxl v1,0,rd
	289	stvxl v2,c16,rd
	290	stvxl v3,c32,rd
	291	stvxl v4,c48,rd
91447636 A	292	stvxl v5,c64,rd
	293	stvxl v6,c80,rd
	294	stvxl v7,c96,rd
	295	stvxl v8,c112,rd
	296	addi rd,rd,256 // point to next dest chunk
	297	stvxl v9,0,r2
	298	stvxl v10,c16,r2
	299	stvxl v11,c32,r2
	300	stvxl v12,c48,r2
	301	stvxl v13,c64,r2
	302	stvxl v14,c80,r2
	303	stvxl v15,c96,r2
	304	stvxl v16,c112,r2
	305	bdnz++ LalignedLoop // loop if another 256 bytes to go
	306
	307
	308	// Done, except for 0..255 leftover bytes at end.
55e303ae A	309	// rs = source ptr
55e303ae A	310	// rd = dest ptr
91447636	311	// rc = remaining count in low 7 bits
55e303ae	312	// rv = caller's vrsave
91447636	313	// rx = caller's return address
55e303ae A	314
55e303ae A	315	Ldone:
91447636 A	316	andi. r5,rc,0xFF // any leftover bytes? (0..255)
91447636 A	317	mtspr vrsave,rv // restore bitmap of live vr's
55e303ae	318
91447636 A	319	mr r3,rd
	320	mr r4,rs
	321	bnela _COMM_PAGE_MEMCPY // copy leftover bytes
	322
	323	mtlr rx // restore return address
	324	ld r3,rzR3(r1) // restore non-volatile GPRs from redzone
	325	ld r13,rzR13(r1)
	326	ld r14,rzR14(r1)
	327	ld r15,rzR15(r1)
	328	ld r16,rzR16(r1)
55e303ae A	329	blr
	330
	331
91447636	332	COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)
55e303ae	333