/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 2/21/2004, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove(),
 * and runs both in 32 and 64-bit mode.
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth:
 *  1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive).
 *     We use 256-byte chunks.
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 */
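
// Rough arithmetic for the above: a 1 MB operand runs the main loop
// 1 MB / 256 = 4096 times, issuing two dcbz128's per iteration (unless the
// kernel has cleared cr7), while any dcbz alignment exception is taken at
// most once per call.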

#define rs      r13
#define rd      r14
#define rc      r15
#define rx      r16

#define c16     r3
#define c32     r4
#define c48     r5
#define c64     r6
#define c80     r7
#define c96     r8
#define c112    r9
#define c256    r10
#define c384    r11
#define rv      r12     // vrsave

// Offsets within the "red zone" (which is 224 bytes long):

#define rzR3    -8
#define rzR13   -16
#define rzR14   -24
#define rzR15   -32
#define rzR16   -40

#define rzV20   -64
#define rzV21   -80
#define rzV22   -96
#define rzV23   -112
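
// These offsets are negative displacements from the stack pointer (r1): the
// routine spills registers into the red zone below r1 rather than allocating
// a stack frame.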

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
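
// For example, the "srwi r2,rc,8" below (number of 256-byte chunks) is one of
// the opcodes rewritten by this port: in the 64-bit commpage it becomes
// "srdi r2,rc,8", so the chunk count is taken from the full 64-bit length.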

// Entry point.  This is a subroutine of bcopy().  When called:
//      r0  = return address (also stored in caller's SF)
//      r4  = source ptr
//      r5  = length (at least several pages)
//      r12 = dest ptr
//
// We only do "forward" moves, ie non-overlapping or toward 0.  We return with
// non-volatiles and r3 preserved.

        .align  5
bigcopy_970:
        neg     r2,r12              // is destination cache-line-aligned?
        std     r3,rzR3(r1)         // save caller's r3, which must be preserved for memcpy()
        std     r13,rzR13(r1)       // spill non-volatile regs we use to redzone
        std     r14,rzR14(r1)
        std     r15,rzR15(r1)
        andi.   r2,r2,0x7F          // #bytes to align
        std     r16,rzR16(r1)
        mr      rs,r4               // copy parameters into nonvolatile registers
        mr      rd,r12
        mr      rc,r5
        mr      rx,r0               // also save return address
        beq     1f                  // skip if already aligned

// Cache-line-align destination.

        mr      r3,rd               // set up dest ptr for memcpy()
        mr      r5,r2               // number of bytes to copy
        add     rs,rs,r2            // then bump our parameters past initial copy
        add     rd,rd,r2
        sub     rc,rc,r2
        bla     _COMM_PAGE_MEMCPY   // 128-byte-align destination
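
// Note that "bla" is an absolute branch-and-link into the commpage, so it
// clobbers LR; that is why the return address was copied into rx above and is
// restored with "mtlr rx" before the final blr.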


// Load constant offsets and check whether source is 16-byte aligned.
// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only if cr7 beq is set.

1:
        dcbt    0,rs                // touch in 1st line of source
        andi.   r0,rs,15            // check source alignment
        mfspr   rv,vrsave           // save caller's bitmask
        li      c16,16              // load the constant offsets for x-form ops
        li      c32,32
        srwi    r2,rc,8             // get number of 256-byte chunks to xfer
        li      r0,-256             // we use 24 VRs (ie, 0-23)
        li      c48,48
        li      c64,64
        li      c80,80
        or      r0,r0,rv            // add our bits to caller's
        li      c96,96
        mtctr   r2                  // set up loop count
        li      c112,112
        cmpd    cr7,r2,r2           // initialize cr7_eq to "on", so we dcbz128
        mtspr   vrsave,r0           // say we use vr0..vr23
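// (li r0,-256 leaves 0xFFFFFF00 in the low word; since the most significant
// bit of vrsave corresponds to v0, those top 24 bits mark v0..v23 as live,
// OR'd with the caller's own mask.)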
        li      c256,256
        li      c384,384
        beq     LalignedLoop        // handle aligned sources


// Set up for unaligned loop.

        lvsl    v0,0,rs             // get permute vector for left shift
        lvxl    v1,0,rs             // prime the loop
        li      r0,rzV20            // save non-volatile VRs in redzone
        stvx    v20,r1,r0
        li      r0,rzV21
        stvx    v21,r1,r0
        li      r0,rzV22
        stvx    v22,r1,r0
        li      r0,rzV23
        stvx    v23,r1,r0
        b       LunalignedLoop      // enter unaligned loop


// Main loop for unaligned operands.  We loop over 256-byte chunks (2 cache lines).
// Destination is 128-byte aligned, source is unaligned.

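// The loop software-pipelines the realignment: lvsl computed the permute
// vector v0 once, each vperm merges two adjacent source quadwords into one
// aligned destination quadword, and v1 carries the first quadword of the next
// chunk across iterations (the "peek ahead" load below).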
        .align  5
LunalignedLoop:
        dcbt    c256,rs             // touch in next chunk
        dcbt    c384,rs
        addi    r2,rs,128           // point to 2nd 128 bytes of source
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        lvxl    v5,c64,rs
        lvxl    v6,c80,rs
        lvxl    v7,c96,rs
        lvxl    v8,c112,rs
        lvxl    v9,0,r2
        addi    rs,rs,256           // point to next source chunk
        lvxl    v10,c16,r2
        lvxl    v11,c32,r2
        vperm   v17,v1,v2,v0
        lvxl    v12,c48,r2
        lvxl    v13,c64,r2
        vperm   v18,v2,v3,v0
        lvxl    v14,c80,r2
        lvxl    v15,c96,r2
        vperm   v19,v3,v4,v0
        lvxl    v16,c112,r2
        lvxl    v1,0,rs             // peek ahead at first source quad in next chunk
        vperm   v20,v4,v5,v0
        addi    r2,rd,128           // point to 2nd 128 bytes of dest
        bne--   cr7,1f              // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd
        dcbz128 0,r2
1:
        vperm   v21,v5,v6,v0
        stvxl   v17,0,rd
        vperm   v22,v6,v7,v0
        stvxl   v18,c16,rd
        vperm   v23,v7,v8,v0
        stvxl   v19,c32,rd
        vperm   v17,v8,v9,v0
        stvxl   v20,c48,rd
        vperm   v18,v9,v10,v0
        stvxl   v21,c64,rd
        vperm   v19,v10,v11,v0
        stvxl   v22,c80,rd
        vperm   v20,v11,v12,v0
        stvxl   v23,c96,rd
        vperm   v21,v12,v13,v0
        stvxl   v17,c112,rd
        vperm   v22,v13,v14,v0
        addi    rd,rd,256           // point to next dest chunk
        stvxl   v18,0,r2
        vperm   v23,v14,v15,v0
        stvxl   v19,c16,r2
        vperm   v17,v15,v16,v0
        stvxl   v20,c32,r2
        vperm   v18,v16,v1,v0
        stvxl   v21,c48,r2
        stvxl   v22,c64,r2
        stvxl   v23,c80,r2
        stvxl   v17,c96,r2
        stvxl   v18,c112,r2
        bdnz++  LunalignedLoop      // loop if another 256 bytes to go

        li      r6,rzV20            // restore non-volatile VRs
        li      r7,rzV21
        li      r8,rzV22
        li      r9,rzV23
        lvx     v20,r1,r6
        lvx     v21,r1,r7
        lvx     v22,r1,r8
        lvx     v23,r1,r9
        b       Ldone


// Aligned loop.  Destination is 128-byte aligned, and source is 16-byte
// aligned.  Loop over 256-byte chunks (2 cache lines).

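// With both pointers 16-byte aligned there is no need for the lvsl/vperm
// realignment used above; this loop simply streams sixteen lvxl/stvxl pairs
// per 256-byte chunk.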
        .align  5
LalignedLoop:
        dcbt    c256,rs             // touch in next chunk
        dcbt    c384,rs
        addi    r2,rs,128           // point to 2nd 128 bytes of source
        lvxl    v1,0,rs
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        lvxl    v5,c64,rs
        lvxl    v6,c80,rs
        lvxl    v7,c96,rs
        lvxl    v8,c112,rs
        lvxl    v9,0,r2
        lvxl    v10,c16,r2
        lvxl    v11,c32,r2
        lvxl    v12,c48,r2
        lvxl    v13,c64,r2
        lvxl    v14,c80,r2
        lvxl    v15,c96,r2
        lvxl    v16,c112,r2
        addi    r2,rd,128           // point to 2nd 128 bytes of dest
        bne--   cr7,1f              // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd
        dcbz128 0,r2
1:
        addi    rs,rs,256           // point to next source chunk
        stvxl   v1,0,rd
        stvxl   v2,c16,rd
        stvxl   v3,c32,rd
        stvxl   v4,c48,rd
        stvxl   v5,c64,rd
        stvxl   v6,c80,rd
        stvxl   v7,c96,rd
        stvxl   v8,c112,rd
        addi    rd,rd,256           // point to next dest chunk
        stvxl   v9,0,r2
        stvxl   v10,c16,r2
        stvxl   v11,c32,r2
        stvxl   v12,c48,r2
        stvxl   v13,c64,r2
        stvxl   v14,c80,r2
        stvxl   v15,c96,r2
        stvxl   v16,c112,r2
        bdnz++  LalignedLoop        // loop if another 256 bytes to go


// Done, except for 0..255 leftover bytes at end.
//      rs = source ptr
//      rd = dest ptr
//      rc = remaining count in low 8 bits
//      rv = caller's vrsave
//      rx = caller's return address

Ldone:
        andi.   r5,rc,0xFF          // any leftover bytes? (0..255)
        mtspr   vrsave,rv           // restore bitmap of live vr's

        mr      r3,rd
        mr      r4,rs
        bnela   _COMM_PAGE_MEMCPY   // copy leftover bytes

        mtlr    rx                  // restore return address
        ld      r3,rzR3(r1)         // restore non-volatile GPRs from redzone
        ld      r13,rzR13(r1)
        ld      r14,rzR14(r1)
        ld      r15,rzR15(r1)
        ld      r16,rzR16(r1)
        blr


        COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)
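
// Presumably kPort32to64 asks the commpage-populating code to apply the
// 32-bit -> 64-bit opcode translation described in the WARNING comment above,
// and kCommPageBoth requests installation in both the 32- and 64-bit commpages.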