[apple/xnu.git] / osfmk / ppc / commpage / bigcopy_970.s

/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 2/21/2004, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove(),
 * and runs both in 32 and 64-bit mode.
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth:
 *	1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *	2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive.)
 *     We use 256-byte chunks.
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 */
 
// Working registers.  These are non-volatile GPRs: bigcopy_970 spills them to
// the red zone on entry and reloads them before returning.
//      rs = source ptr      (copied from r4 on entry)
//      rd = dest ptr        (copied from r12 on entry)
//      rc = byte count      (copied from r5 on entry)
//      rx = return address  (copied from r0 on entry)

#define rs	r13
#define rd	r14
#define rc	r15
#define rx  r16

// Byte offsets kept in GPRs and used as the index operand of the x-form
// lvxl/stvxl/dcbt instructions.  Each cNN register is loaded with the value NN.
// Note that these alias r3..r11, which overlap the entry parameters (r4, r5);
// the parameters are copied into rs/rd/rc before the constants are loaded, and
// c16 is r3, which is why the incoming r3 is saved in the red zone.

#define c16     r3
#define c32     r4
#define c48     r5
#define c64     r6
#define c80     r7
#define c96     r8
#define c112    r9
#define	c256	r10
#define	c384	r11
#define rv      r12     // vrsave

// Offsets within the "red zone" (which is 224 bytes long):
// All are negative displacements from r1.  The GPR slots are 8 bytes apart
// because they are written with std and read back with ld.

#define rzR3    -8
#define rzR13	-16
#define rzR14	-24
#define rzR15   -32
#define rzR16   -40

// Save slots for v20..v23, 16 bytes apart, accessed with stvx/lvx.  Only the
// unaligned path uses (and therefore saves/restores) these vector registers.

#define rzV20	-64
#define rzV21	-80
#define rzV22	-96
#define rzV23	-112


#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"                      
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */

// Entry point.  This is a subroutine of bcopy().  When called:
//  r0 = return address (also stored in caller's SF)
//	r1 = stack ptr; the red zone below it is used as our save area
//	r3 = caller's value, saved on entry and restored before return
//	r4 = source ptr
//	r5 = length (at least several pages)
// r12 = dest ptr
//
// We only do "forward" moves, ie non-overlapping or toward 0.  We return with non-volatiles
// and r3 preserved.
//
// Flow of this section:
//  1. copy 0..127 bytes with _COMM_PAGE_MEMCPY so the destination becomes
//     128-byte aligned,
//  2. load the constant offsets, extend vrsave, and put the number of 256-byte
//     chunks in ctr,
//  3. branch to LalignedLoop if the source is 16-byte aligned, otherwise prime
//     the permute state and enter LunalignedLoop.

        .align 	5
bigcopy_970:
        neg     r2,r12              // is destination cache-line-aligned?
        std     r3,rzR3(r1)         // save caller's r3, which must be preserved for memcpy()
        std		r13,rzR13(r1)		// spill non-volatile regs we use to redzone
        std		r14,rzR14(r1)
        std		r15,rzR15(r1)
        andi.   r2,r2,0x7F          // #bytes to align = (-dest) & 127; sets cr0 for the beq below
        std     r16,rzR16(r1)
        mr      rs,r4               // copy parameters into nonvolatile registers
        mr      rd,r12
        mr      rc,r5
        mr      rx,r0               // also save return address
        beq     1f                  // skip if already aligned (cr0 from the andi. above)

// Cache-line-align destination.
// The first r2 (1..127) bytes are copied by the commpage memcpy, called with
// r3 = dest, r4 = source (still the value we were entered with) and r5 = length.
// r3 is loaded from rd before rd is advanced; rs/rd/rc are then bumped so they
// describe what remains after this initial copy.
        
        mr      r3,rd               // set up dest ptr for memcpy()
        mr      r5,r2               // number of bytes to copy
        add     rs,rs,r2            // then bump our parameters past initial copy
        add     rd,rd,r2
        sub     rc,rc,r2
        bla     _COMM_PAGE_MEMCPY   // 128-byte-align destination


// Load constant offsets and check whether source is 16-byte aligned.
// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only if cr7 beq is set.
//
// The alignment test (andi.) is issued first and its cr0 result is consumed by
// the beq at the end of this sequence; nothing in between writes cr0 (the
// compare targets cr7, and the srwi is not in record form).
// NOTE(review): ctr is loaded with rc/256 without a zero check; this relies on
// the entry contract that the length is at least several pages.

1:
        dcbt    0,rs                // touch in 1st line of source
        andi.	r0,rs,15			// check source alignment
        mfspr	rv,vrsave			// save caller's bitmask
        li		c16,16				// load the constant offsets for x-form ops
        li		c32,32
        srwi    r2,rc,8             // get number of 256-byte chunks to xfer
        li		r0,-256				// we use 24 VRs (ie, 0-23): mask is 0xFFFFFF00
        li		c48,48
        li      c64,64
        li      c80,80
        or      r0,r0,rv            // add our bits to caller's
        li      c96,96
        mtctr   r2                  // set up loop count
        li      c112,112
        cmpd    cr7,r2,r2           // initialize cr7_eq to "on", so we dcbz128 (a register always equals itself)
        mtspr	vrsave,r0           // say we use vr0..vr23
        li		c256,256
        li		c384,384
        beq		LalignedLoop		// handle aligned sources (cr0 from the andi. above)

        
// Set up for unaligned loop.
// v0 is the permute control used by every vperm in the loop, and v1 is the first
// source quadword.  v20..v23 are saved because the unaligned loop uses them as
// temporaries; r0 is used as a scratch index since stvx is an x-form store.

        lvsl	v0,0,rs				// get permute vector for left shift
        lvxl	v1,0,rs				// prime the loop
        li		r0,rzV20            // save non-volatile VRs in redzone
        stvx	v20,r1,r0
        li		r0,rzV21
        stvx	v21,r1,r0
        li		r0,rzV22
        stvx	v22,r1,r0
        li		r0,rzV23
        stvx	v23,r1,r0
        b		LunalignedLoop		// enter unaligned loop


// Main loop for unaligned operands.  We loop over 256-byte chunks (2 cache lines).
// Destination is 128-byte aligned, source is unaligned.
//
// State at the top of each iteration:
//      v0  = permute control vector from lvsl
//      v1  = first source quadword of this chunk (already loaded)
//      ctr = number of chunks still to copy
//
// Each iteration loads source quadwords 1..15 of the chunk into v2..v16 and
// quadword 0 of the next chunk into v1, then merges every adjacent pair with
// vperm to form one aligned 16-byte destination quadword (16 per chunk).
// v17..v23 hold the merged results; each one is overwritten only after its
// previous contents have been stored, so the interleaving of vperm and stvxl
// below must not be reordered casually.
// The "--"/"++" branch suffixes are presumably static prediction hints -- confirm
// against the assembler documentation.

        .align	5
LunalignedLoop:
        dcbt	c256,rs             // touch in next chunk
        dcbt	c384,rs
        addi    r2,rs,128           // point to 2nd 128 bytes of source
        lvxl	v2,c16,rs
        lvxl	v3,c32,rs
        lvxl	v4,c48,rs
        lvxl    v5,c64,rs
        lvxl    v6,c80,rs
        lvxl    v7,c96,rs
        lvxl    v8,c112,rs
        lvxl    v9,0,r2
        addi    rs,rs,256           // point to next source chunk (r2 still addresses this chunk)
        lvxl    v10,c16,r2
        lvxl    v11,c32,r2
        vperm   v17,v1,v2,v0        // dest quadword 0
        lvxl    v12,c48,r2
        lvxl    v13,c64,r2
        vperm   v18,v2,v3,v0        // dest quadword 1
        lvxl    v14,c80,r2
        lvxl    v15,c96,r2
        vperm   v19,v3,v4,v0        // dest quadword 2
        lvxl    v16,c112,r2
        lvxl	v1,0,rs             // peek ahead at first source quad in next chunk
        vperm   v20,v4,v5,v0        // dest quadword 3
        addi    r2,rd,128           // point to 2nd 128 bytes of dest 
        bne--	cr7,1f				// skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128	0,rd                // zero both destination lines so they need not be read
        dcbz128	0,r2
1:
        vperm   v21,v5,v6,v0        // dest quadword 4
        stvxl	v17,0,rd
        vperm   v22,v6,v7,v0        // dest quadword 5
        stvxl	v18,c16,rd
        vperm   v23,v7,v8,v0        // dest quadword 6
        stvxl	v19,c32,rd
        vperm   v17,v8,v9,v0        // dest quadword 7 (v17 was stored above)
        stvxl	v20,c48,rd
        vperm   v18,v9,v10,v0       // dest quadword 8
        stvxl	v21,c64,rd
        vperm   v19,v10,v11,v0      // dest quadword 9
        stvxl	v22,c80,rd
        vperm   v20,v11,v12,v0      // dest quadword 10
        stvxl	v23,c96,rd
        vperm   v21,v12,v13,v0      // dest quadword 11
        stvxl	v17,c112,rd
        vperm   v22,v13,v14,v0      // dest quadword 12
        addi	rd,rd,256           // point to next dest chunk (r2 still addresses 2nd half of this one)
        stvxl	v18,0,r2
        vperm   v23,v14,v15,v0      // dest quadword 13
        stvxl	v19,c16,r2
        vperm   v17,v15,v16,v0      // dest quadword 14
        stvxl	v20,c32,r2
        vperm   v18,v16,v1,v0       // dest quadword 15, uses the peeked-ahead quad of the next chunk
        stvxl	v21,c48,r2
        stvxl	v22,c64,r2
        stvxl	v23,c80,r2
        stvxl	v17,c96,r2
        stvxl	v18,c112,r2
        bdnz++	LunalignedLoop      // loop if another 256 bytes to go

// Loop finished.  r6..r9 (the c64..c112 offset registers) are no longer needed
// as constants, so they are reused as index registers for the x-form lvx loads.

        li		r6,rzV20            // restore non-volatile VRs
        li		r7,rzV21
        li		r8,rzV22
        li		r9,rzV23
        lvx		v20,r1,r6
        lvx		v21,r1,r7
        lvx		v22,r1,r8
        lvx		v23,r1,r9
        b       Ldone
        
        
// Aligned loop.  Destination is 128-byte aligned, and source is 16-byte
// aligned.  Loop over 256-byte chunks (2 cache lines.)
//
// Each iteration loads the 16 source quadwords of the chunk into v1..v16 and
// stores them unchanged; no permute is needed, so v17..v23 are not touched on
// this path.  r2 is a scratch pointer: first to the second 128 bytes of the
// source, then to the second 128 bytes of the destination.  ctr holds the number
// of chunks still to copy.  Falls through to Ldone when ctr reaches zero.

        .align	5
LalignedLoop:
        dcbt	c256,rs             // touch in next chunk
        dcbt	c384,rs
        addi    r2,rs,128           // point to 2nd 128 bytes of source
        lvxl	v1,0,rs
        lvxl	v2,c16,rs
        lvxl	v3,c32,rs
        lvxl	v4,c48,rs
        lvxl    v5,c64,rs
        lvxl    v6,c80,rs
        lvxl    v7,c96,rs
        lvxl    v8,c112,rs
        lvxl    v9,0,r2
        lvxl    v10,c16,r2
        lvxl    v11,c32,r2
        lvxl    v12,c48,r2
        lvxl    v13,c64,r2
        lvxl    v14,c80,r2
        lvxl    v15,c96,r2
        lvxl    v16,c112,r2
        addi    r2,rd,128           // point to 2nd 128 bytes of dest 
        bne--	cr7,1f				// skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128	0,rd                // zero both destination lines so they need not be read
        dcbz128	0,r2
1:
        addi    rs,rs,256           // point to next source chunk
        stvxl	v1,0,rd
        stvxl	v2,c16,rd
        stvxl	v3,c32,rd
        stvxl	v4,c48,rd
        stvxl	v5,c64,rd
        stvxl	v6,c80,rd
        stvxl	v7,c96,rd
        stvxl	v8,c112,rd
        addi	rd,rd,256           // point to next dest chunk (r2 still addresses 2nd half of this one)
        stvxl	v9,0,r2
        stvxl	v10,c16,r2
        stvxl	v11,c32,r2
        stvxl	v12,c48,r2
        stvxl	v13,c64,r2
        stvxl	v14,c80,r2
        stvxl	v15,c96,r2
        stvxl	v16,c112,r2
        bdnz++	LalignedLoop		// loop if another 256 bytes to go


// Done, except for 0..255 leftover bytes at end.
//	rs = source ptr
//	rd = dest ptr
//	rc = remaining count in low 8 bits
//	rv = caller's vrsave
//  rx = caller's return address
//
// Reached by fall-through from LalignedLoop and by branch from the unaligned
// path (after v20..v23 have been reloaded).  The leftover bytes are handed to
// the commpage memcpy with r3 = dest, r4 = source, r5 = count; the call is made
// only when the count is nonzero.  Afterwards the return address, the caller's
// r3 and r13..r16 are restored from the red zone and we return.

Ldone:
        andi.   r5,rc,0xFF          // any leftover bytes? (0..255)
        mtspr	vrsave,rv			// restore bitmap of live vr's
        
        mr      r3,rd
        mr      r4,rs
        bnela   _COMM_PAGE_MEMCPY   // copy leftover bytes (cr0 from the andi. above; skipped if r5 == 0)

        mtlr    rx                  // restore return address
        ld      r3,rzR3(r1)         // restore non-volatile GPRs from redzone
        ld		r13,rzR13(r1)
        ld		r14,rzR14(r1)
        ld		r15,rzR15(r1)
        ld      r16,rzR16(r1)
        blr


// Commpage descriptor for this routine.  The macro and the k* flags come from
// the included headers; kPort32to64 presumably requests the 32-to-64-bit port
// described in the WARNING comment near the top of the file, and kCommPageBoth
// presumably installs the routine in both commpages -- TODO confirm against
// machine/commpage.h.
        COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)
Commit	Line	Data
55e303ae A	1	/*
	2	* Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
	3	*
2d21ac55	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
55e303ae	5	*
2d21ac55 A	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
8f6c56a5	14	*
2d21ac55 A	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5 A	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
8f6c56a5 A	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55 A	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
8f6c56a5	25	*
2d21ac55	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
55e303ae A	27	*/
	28	/* ====================================
	29	* Very Long Operand BCOPY for Mac OS X
	30	* ====================================
	31	*
91447636 A	32	* Version of 2/21/2004, tuned for the IBM 970. This is for operands at
	33	* least several pages long. It is called from bcopy()/memcpy()/memmove(),
	34	* and runs both in 32 and 64-bit mode.
55e303ae A	35	*
	36	* We use the following additional strategies not used by the shorter
	37	* operand paths. Mostly, we try to optimize for memory bandwidth:
	38	* 1. Use DCBZ128 to avoid reading destination lines. Because this code
	39	* resides on the commmpage, it can use a private interface with the
	40	* kernel to minimize alignment exceptions if the destination is
	41	* uncached. The kernel will clear cr7 whenever it emulates a DCBZ or
	42	* DCBZ128 on the commpage. Thus we take at most one exception per call,
	43	* which is amortized across the very long operand.
	44	* 2. Copy larger chunks per iteration to minimize R/W bus turnaround
	45	* and maximize DRAM page locality (opening a new page is expensive.)
91447636	46	* We use 256-byte chunks.
55e303ae A	47	* 3. Touch in one source chunk ahead with DCBT. This is probably the
	48	* least important change, and probably only helps restart the
	49	* hardware stream at the start of each source page.
55e303ae	50	*/
91447636 A	51
	52	#define rs r13
	53	#define rd r14
	54	#define rc r15
	55	#define rx r16
	56
	57	#define c16 r3
	58	#define c32 r4
	59	#define c48 r5
	60	#define c64 r6
	61	#define c80 r7
	62	#define c96 r8
	63	#define c112 r9
	64	#define c256 r10
	65	#define c384 r11
	66	#define rv r12 // vrsave
55e303ae A	67
	68	// Offsets within the "red zone" (which is 224 bytes long):
	69
91447636 A	70	#define rzR3 -8
	71	#define rzR13 -16
	72	#define rzR14 -24
	73	#define rzR15 -32
	74	#define rzR16 -40
	75
	76	#define rzV20 -64
	77	#define rzV21 -80
	78	#define rzV22 -96
	79	#define rzV23 -112
55e303ae A	80
	81
	82	#include <sys/appleapiopts.h>
	83	#include <ppc/asm.h>
	84	#include <machine/cpu_capabilities.h>
	85	#include <machine/commpage.h>
	86
	87	.text
91447636 A	88	/*
	89	* WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
	90	* to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
	91	* simple transformations:
	92	* - all word compares are changed to doubleword
	93	* - all "srwi[.]" opcodes are changed to "srdi[.]"
	94	* Nothing else is done. For this to work, the following rules must be
	95	* carefully followed:
	96	* - do not use carry or overflow
	97	* - only use record mode if you are sure the results are mode-invariant
	98	* for example, all "andi." and almost all "rlwinm." are fine
	99	* - do not use "slwi", "slw", or "srw"
	100	* An imaginative programmer could break the porting model in other ways, but the above
	101	* are the most likely problem areas. It is perhaps surprising how well in practice
	102	* this simple method works.
	103	*/
55e303ae A	104
55e303ae A	105	// Entry point. This is a subroutine of bcopy(). When called:
91447636 A	106	// r0 = return address (also stored in caller's SF)
	107	// r4 = source ptr
	108	// r5 = length (at least several pages)
	109	// r12 = dest ptr
55e303ae	110	//
91447636 A	111	// We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles
91447636 A	112	// and r3 preserved.
55e303ae A	113
	114	.align 5
	115	bigcopy_970:
91447636 A	116	neg r2,r12 // is destination cache-line-aligned?
	117	std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy()
	118	std r13,rzR13(r1) // spill non-volatile regs we use to redzone
	119	std r14,rzR14(r1)
	120	std r15,rzR15(r1)
	121	andi. r2,r2,0x7F // #bytes to align
	122	std r16,rzR16(r1)
	123	mr rs,r4 // copy parameters into nonvolatile registers
	124	mr rd,r12
	125	mr rc,r5
	126	mr rx,r0 // also save return address
	127	beq 1f // skip if already aligned
55e303ae A	128
55e303ae A	129	// Cache-line-align destination.
91447636 A	130
	131	mr r3,rd // set up dest ptr for memcpy()
	132	mr r5,r2 // number of bytes to copy
	133	add rs,rs,r2 // then bump our parameters past initial copy
	134	add rd,rd,r2
	135	sub rc,rc,r2
	136	bla _COMM_PAGE_MEMCPY // 128-byte-align destination
55e303ae A	137
55e303ae A	138
91447636 A	139	// Load constant offsets and check whether source is 16-byte aligned.
	140	// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
	141	// and we dcbz only if cr7 beq is set.
55e303ae	142
91447636 A	143	1:
91447636 A	144	dcbt 0,rs // touch in 1st line of source
55e303ae A	145	andi. r0,rs,15 // check source alignment
55e303ae A	146	mfspr rv,vrsave // save caller's bitmask
55e303ae A	147	li c16,16 // load the constant offsets for x-form ops
55e303ae A	148	li c32,32
91447636 A	149	srwi r2,rc,8 // get number of 256-byte chunks to xfer
91447636 A	150	li r0,-256 // we use 24 VRs (ie, 0-23)
55e303ae	151	li c48,48
91447636 A	152	li c64,64
	153	li c80,80
	154	or r0,r0,rv // add our bits to caller's
	155	li c96,96
	156	mtctr r2 // set up loop count
	157	li c112,112
	158	cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128
	159	mtspr vrsave,r0 // say we use vr0..vr23
55e303ae A	160	li c256,256
55e303ae A	161	li c384,384
91447636	162	beq LalignedLoop // handle aligned sources
55e303ae	163
55e303ae	164
91447636	165	// Set up for unaligned loop.
55e303ae	166
55e303ae A	167	lvsl v0,0,rs // get permute vector for left shift
55e303ae A	168	lvxl v1,0,rs // prime the loop
91447636 A	169	li r0,rzV20 // save non-volatile VRs in redzone
	170	stvx v20,r1,r0
	171	li r0,rzV21
	172	stvx v21,r1,r0
	173	li r0,rzV22
	174	stvx v22,r1,r0
	175	li r0,rzV23
	176	stvx v23,r1,r0
55e303ae A	177	b LunalignedLoop // enter unaligned loop
	178
	179
91447636 A	180	// Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines).
91447636 A	181	// Destination is 128-byte aligned, source is unaligned.
55e303ae A	182
	183	.align 5
	184	LunalignedLoop:
91447636 A	185	dcbt c256,rs // touch in next chunk
	186	dcbt c384,rs
	187	addi r2,rs,128 // point to 2nd 128 bytes of source
55e303ae A	188	lvxl v2,c16,rs
55e303ae A	189	lvxl v3,c32,rs
91447636 A	190	lvxl v4,c48,rs
	191	lvxl v5,c64,rs
	192	lvxl v6,c80,rs
	193	lvxl v7,c96,rs
	194	lvxl v8,c112,rs
	195	lvxl v9,0,r2
	196	addi rs,rs,256 // point to next source chunk
	197	lvxl v10,c16,r2
	198	lvxl v11,c32,r2
	199	vperm v17,v1,v2,v0
	200	lvxl v12,c48,r2
	201	lvxl v13,c64,r2
	202	vperm v18,v2,v3,v0
	203	lvxl v14,c80,r2
	204	lvxl v15,c96,r2
	205	vperm v19,v3,v4,v0
	206	lvxl v16,c112,r2
	207	lvxl v1,0,rs // peek ahead at first source quad in next chunk
	208	vperm v20,v4,v5,v0
	209	addi r2,rd,128 // point to 2nd 128 bytes of dest
55e303ae	210	bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
91447636 A	211	dcbz128 0,rd
91447636 A	212	dcbz128 0,r2
55e303ae	213	1:
91447636 A	214	vperm v21,v5,v6,v0
	215	stvxl v17,0,rd
	216	vperm v22,v6,v7,v0
	217	stvxl v18,c16,rd
	218	vperm v23,v7,v8,v0
	219	stvxl v19,c32,rd
	220	vperm v17,v8,v9,v0
	221	stvxl v20,c48,rd
	222	vperm v18,v9,v10,v0
	223	stvxl v21,c64,rd
	224	vperm v19,v10,v11,v0
	225	stvxl v22,c80,rd
	226	vperm v20,v11,v12,v0
	227	stvxl v23,c96,rd
	228	vperm v21,v12,v13,v0
	229	stvxl v17,c112,rd
	230	vperm v22,v13,v14,v0
	231	addi rd,rd,256 // point to next dest chunk
	232	stvxl v18,0,r2
	233	vperm v23,v14,v15,v0
	234	stvxl v19,c16,r2
	235	vperm v17,v15,v16,v0
	236	stvxl v20,c32,r2
	237	vperm v18,v16,v1,v0
	238	stvxl v21,c48,r2
	239	stvxl v22,c64,r2
	240	stvxl v23,c80,r2
	241	stvxl v17,c96,r2
	242	stvxl v18,c112,r2
	243	bdnz++ LunalignedLoop // loop if another 256 bytes to go
	244
	245	li r6,rzV20 // restore non-volatile VRs
	246	li r7,rzV21
	247	li r8,rzV22
	248	li r9,rzV23
	249	lvx v20,r1,r6
	250	lvx v21,r1,r7
	251	lvx v22,r1,r8
	252	lvx v23,r1,r9
	253	b Ldone
55e303ae A	254
	255
	256	// Aligned loop. Destination is 128-byte aligned, and source is 16-byte
91447636	257	// aligned. Loop over 256-byte chunks (2 cache lines.)
55e303ae A	258
	259	.align 5
	260	LalignedLoop:
91447636 A	261	dcbt c256,rs // touch in next chunk
	262	dcbt c384,rs
	263	addi r2,rs,128 // point to 2nd 128 bytes of source
55e303ae A	264	lvxl v1,0,rs
55e303ae A	265	lvxl v2,c16,rs
55e303ae A	266	lvxl v3,c32,rs
55e303ae A	267	lvxl v4,c48,rs
91447636 A	268	lvxl v5,c64,rs
	269	lvxl v6,c80,rs
	270	lvxl v7,c96,rs
	271	lvxl v8,c112,rs
	272	lvxl v9,0,r2
	273	lvxl v10,c16,r2
	274	lvxl v11,c32,r2
	275	lvxl v12,c48,r2
	276	lvxl v13,c64,r2
	277	lvxl v14,c80,r2
	278	lvxl v15,c96,r2
	279	lvxl v16,c112,r2
	280	addi r2,rd,128 // point to 2nd 128 bytes of dest
	281	bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
	282	dcbz128 0,rd
	283	dcbz128 0,r2
	284	1:
	285	addi rs,rs,256 // point to next source chunk
55e303ae A	286	stvxl v1,0,rd
	287	stvxl v2,c16,rd
	288	stvxl v3,c32,rd
	289	stvxl v4,c48,rd
91447636 A	290	stvxl v5,c64,rd
	291	stvxl v6,c80,rd
	292	stvxl v7,c96,rd
	293	stvxl v8,c112,rd
	294	addi rd,rd,256 // point to next dest chunk
	295	stvxl v9,0,r2
	296	stvxl v10,c16,r2
	297	stvxl v11,c32,r2
	298	stvxl v12,c48,r2
	299	stvxl v13,c64,r2
	300	stvxl v14,c80,r2
	301	stvxl v15,c96,r2
	302	stvxl v16,c112,r2
	303	bdnz++ LalignedLoop // loop if another 256 bytes to go
	304
	305
	306	// Done, except for 0..255 leftover bytes at end.
55e303ae A	307	// rs = source ptr
55e303ae A	308	// rd = dest ptr
91447636	309	// rc = remaining count in low 7 bits
55e303ae	310	// rv = caller's vrsave
91447636	311	// rx = caller's return address
55e303ae A	312
55e303ae A	313	Ldone:
91447636 A	314	andi. r5,rc,0xFF // any leftover bytes? (0..255)
91447636 A	315	mtspr vrsave,rv // restore bitmap of live vr's
55e303ae	316
91447636 A	317	mr r3,rd
	318	mr r4,rs
	319	bnela _COMM_PAGE_MEMCPY // copy leftover bytes
	320
	321	mtlr rx // restore return address
	322	ld r3,rzR3(r1) // restore non-volatile GPRs from redzone
	323	ld r13,rzR13(r1)
	324	ld r14,rzR14(r1)
	325	ld r15,rzR15(r1)
	326	ld r16,rzR16(r1)
55e303ae A	327	blr
	328
	329
91447636	330	COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)
55e303ae	331