/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove().
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth:
 *  1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive.)
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = constant 16 ("c16")
 *   r7  = constant 32 ("c32")
 *   r8  = constant 48 ("c48")
 *   r9  = constant 128 ("c128")
 *   r10 = vrsave ("rv")
 *   r11 = constant 256 ("c256")
 *   r12 = destination ptr ("rd")
 *   r13 = constant 384 ("c384")
 *   r14 = temp ("rx")
 *   r15 = temp ("rt")
 */
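
/*
 * For orientation, a rough C sketch of the control flow implemented below.
 * (The helpers copy_chunk and copy32 are hypothetical names standing in for
 * the unrolled vector code in this file; this is a sketch of the structure,
 * not the implementation.)
 *
 *      void bigcopy_970(char *rd, const char *rs, unsigned long rc) {
 *          unsigned long n = (0 - (unsigned long)rd) & 127;    // dest to 128-byte boundary
 *          rc -= n;
 *          while (n--) { *rd++ = *rs++; }                      // byte loop "1:" below
 *          unsigned long chunk = ((unsigned long)rs & 15) ? 384 : 512;
 *          do {                                                // LunalignedLoop / LalignedLoop
 *              copy_chunk(rd, rs, chunk);                      // dcbz128 + lvxl/(vperm)/stvxl
 *              rd += chunk; rs += chunk; rc -= chunk;
 *          } while (rc >= chunk);
 *          while (rc >= 32) {                                  // 32-byte leftover loops
 *              copy32(rd, rs); rd += 32; rs += 32; rc -= 32;
 *          }
 *          while (rc--) { *rd++ = *rs++; }                     // final 0..31 bytes
 *      }
 */
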
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10
#define rx      r14
#define rt      r15

#define c16     r6
#define c32     r7
#define c48     r8
#define c128    r9
#define c256    r11
#define c384    r13

// Offsets within the "red zone" (which is 224 bytes long):

#define rzR13   -8
#define rzR14   -12
#define rzR15   -16
#define rzV20   -32
#define rzV21   -48
#define rzV22   -64
#define rzV23   -80
#define rzV24   -96
#define rzV25   -112
#define rzV26   -128
#define rzV27   -144
#define rzV28   -160
#define rzV29   -176
#define rzV30   -192
#define rzV31   -208
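
// These negative offsets address the red zone below the stack pointer (r1):
// since this is a leaf routine, it spills r13-r15 and v20-v31 there without
// allocating a stack frame, and the deepest slot (-208) stays inside the
// 224-byte zone.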


#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl  EXT(bigcopy_970)


// Entry point.  This is a subroutine of bcopy().  When called:
//      r4 = source ptr (aka "rs")
//     r12 = dest ptr (aka "rd")
//      r5 = length (>= 16K bytes) (aka "rc")
//
// We only do "forward" moves, i.e., non-overlapping or toward 0.
//
// We return with non-volatiles and r3 preserved.

        .align  5
bigcopy_970:
        stw     r13,rzR13(r1)           // spill non-volatile regs we use to redzone
        stw     r14,rzR14(r1)
        stw     r15,rzR15(r1)
        li      r0,rzV20
        neg     rt,rd                   // start to cache-line-align destination
        stvx    v20,r1,r0               // we use all 32 VRs
        li      r0,rzV21
        stvx    v21,r1,r0
        li      r0,rzV22
        stvx    v22,r1,r0
        li      r0,rzV23
        stvx    v23,r1,r0
        li      r0,rzV24
        andi.   rt,rt,127               // get #bytes to 128-byte align
        stvx    v24,r1,r0
        li      r0,rzV25
        stvx    v25,r1,r0
        li      r0,rzV26
        sub     rc,rc,rt                // adjust length by #bytes to align destination
        stvx    v26,r1,r0
        li      r0,rzV27
        stvx    v27,r1,r0
        li      r0,rzV28
        mtctr   rt                      // #bytes to align destination
        stvx    v28,r1,r0
        li      r0,rzV29
        stvx    v29,r1,r0
        li      r0,rzV30
        stvx    v30,r1,r0
        li      r0,rzV31
        stvx    v31,r1,r0
        beq     2f                      // dest already 128-byte aligned
        b       1f
149 | ||
150 | ||
151 | // Cache-line-align destination. | |
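// ctr was loaded above with (0 - rd) & 127, the number of bytes up to the
// next 128-byte boundary; e.g. a destination ending in 0x78 yields
// (-0x78) & 127 == 8, so eight single-byte moves are done here.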

        .align  5
1:
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        bdnz    1b


// Is source 16-byte aligned?  Load constant offsets.

2:
        andi.   r0,rs,15                // check source alignment
        mfspr   rv,vrsave               // save caller's bitmask
        li      r0,-1                   // we use all 32 VRs
        li      c16,16                  // load the constant offsets for x-form ops
        li      c32,32
        li      c48,48
        li      c128,128
        li      c256,256
        li      c384,384
        mtspr   vrsave,r0
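
// (Setting vrsave to all ones marks every vector register as live, so the
// kernel will save and restore all 32 VRs across any preemption.)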

// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only while cr7 beq is set.  We must make sure the dcbz's
// cannot zero source bytes before we load them; we zero before loading
// because that is faster than zeroing after the loads and before the stores.

        cmpw    cr7,r0,r0               // initialize cr7 beq to use dcbz128
        sub     rt,rs,rd                // get (rs-rd)
        cmplwi  cr1,rt,512              // are we moving down less than 512 bytes?
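
// Note on the cr1 test above: rt = rs - rd as an unsigned value.  If it is
// less than 512, the dcbz128's issued per chunk would land on source bytes
// that have not been loaded yet, so cr7 is forced off below to suppress them.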

// Start fetching in source cache lines.

        dcbt    c128,rs                 // first line already touched in
        dcbt    c256,rs
        dcbt    c384,rs
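
// (dcbt is architecturally a hint and cannot fault, so touching ahead is
// harmless even where the source is uncached or the touch runs past the
// end of the operand.)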

        bge++   cr1,3f                  // skip if not moving down less than 512 bytes
        cmpw    cr7,c16,c32             // cannot dcbz since it would zero source bytes
3:
        beq     LalignedLoop            // handle aligned sources
        lvsl    v0,0,rs                 // get permute vector for left shift
        lvxl    v1,0,rs                 // prime the loop
        b       LunalignedLoop          // enter unaligned loop


// Main loop for unaligned operands.  We loop over 384-byte chunks (3 cache lines)
// since we need a few VRs for permuted destination QWs and the permute vector.
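//
// The realignment idiom: lvsl computes a permute vector v0 that encodes the
// source misalignment k = (rs & 15); each vperm vD,vA,vB,v0 then selects the
// 16 bytes beginning at offset k within the 32-byte concatenation vA:vB,
// producing one aligned destination quadword from two adjacent aligned loads.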

        .align  5
LunalignedLoop:
        subi    rc,rc,384               // decrement byte count
        addi    rx,rs,384               // get address of next chunk
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        bne--   cr7,1f                  // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd                    // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f                  // catch it first time through
        dcbz128 c128,rd
        dcbz128 c256,rd
1:
        addi    rt,rs,64
        dcbt    0,rx                    // touch in next chunk
        dcbt    c128,rx
        dcbt    c256,rx
        lvxl    v4,c48,rs
        addi    rs,rs,128
        lvxl    v5,0,rt
        cmplwi  rc,384                  // another chunk to go?
        lvxl    v6,c16,rt
        lvxl    v7,c32,rt
        lvxl    v8,c48,rt
        addi    rt,rs,64
        vperm   v25,v1,v2,v0
        lvxl    v9,0,rs
        lvxl    v10,c16,rs
        vperm   v26,v2,v3,v0
        lvxl    v11,c32,rs
        lvxl    v12,c48,rs
        vperm   v27,v3,v4,v0
        addi    rs,rs,128
        lvxl    v13,0,rt
        lvxl    v14,c16,rt
        vperm   v28,v4,v5,v0
        lvxl    v15,c32,rt
        lvxl    v16,c48,rt
        vperm   v29,v5,v6,v0
        addi    rt,rs,64
        lvxl    v17,0,rs
        lvxl    v18,c16,rs
        vperm   v30,v6,v7,v0
        lvxl    v19,c32,rs
        lvxl    v20,c48,rs
        vperm   v31,v7,v8,v0
        addi    rs,rs,128
        lvxl    v21,0,rt
        lvxl    v22,c16,rt
        vperm   v2,v8,v9,v0
        lvxl    v23,c32,rt
        lvxl    v24,c48,rt
        vperm   v3,v9,v10,v0
        lvx     v1,0,rs                 // get 1st qw of next chunk
        vperm   v4,v10,v11,v0

        addi    rt,rd,64
        stvxl   v25,0,rd
        stvxl   v26,c16,rd
        vperm   v5,v11,v12,v0
        stvxl   v27,c32,rd
        stvxl   v28,c48,rd
        vperm   v6,v12,v13,v0
        addi    rd,rd,128
        stvxl   v29,0,rt
        stvxl   v30,c16,rt
        vperm   v7,v13,v14,v0
        stvxl   v31,c32,rt
        stvxl   v2,c48,rt
        vperm   v8,v14,v15,v0
        addi    rt,rd,64
        stvxl   v3,0,rd
        stvxl   v4,c16,rd
        vperm   v9,v15,v16,v0
        stvxl   v5,c32,rd
        stvxl   v6,c48,rd
        vperm   v10,v16,v17,v0
        addi    rd,rd,128
        stvxl   v7,0,rt
        vperm   v11,v17,v18,v0
        stvxl   v8,c16,rt
        stvxl   v9,c32,rt
        vperm   v12,v18,v19,v0
        stvxl   v10,c48,rt
        addi    rt,rd,64
        vperm   v13,v19,v20,v0
        stvxl   v11,0,rd
        stvxl   v12,c16,rd
        vperm   v14,v20,v21,v0
        stvxl   v13,c32,rd
        vperm   v15,v21,v22,v0
        stvxl   v14,c48,rd
        vperm   v16,v22,v23,v0
        addi    rd,rd,128
        stvxl   v15,0,rt
        vperm   v17,v23,v24,v0
        stvxl   v16,c16,rt
        vperm   v18,v24,v1,v0
        stvxl   v17,c32,rt
        stvxl   v18,c48,rt
        bge++   LunalignedLoop          // loop if another 384 bytes to go

// End of unaligned main loop.  Handle up to 383 leftover bytes.

        srwi.   r0,rc,5                 // get count of 32-byte chunks remaining
        beq     Ldone                   // none
        rlwinm  rc,rc,0,0x1F            // mask count down to 0..31 leftover bytes
        mtctr   r0
1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   v8,v1,v2,v0
        vperm   v9,v2,v3,v0
        vor     v1,v3,v3                // v1 <- v3
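// (v1 must carry the last QW loaded, since the next iteration's first
// vperm uses it as the left-hand half of its 32-byte window)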
        stvx    v8,0,rd
        stvx    v9,c16,rd
        addi    rd,rd,32
        bdnz    1b

        b       Ldone


// Aligned loop.  Destination is 128-byte aligned, and source is 16-byte
// aligned.  Loop over 512-byte chunks (4 cache lines.)
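// Since no permute vector or staging registers are needed here, all 32 VRs
// can hold source data, so each iteration moves 512 bytes rather than the
// 384 moved per iteration by the unaligned path.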

        .align  5
LalignedLoop:
        subi    rc,rc,512               // decrement count
        addi    rx,rs,512               // address of next chunk
        lvxl    v1,0,rs
        lvxl    v2,c16,rs
        bne--   cr7,1f                  // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd                    // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f                  // catch it first time through
        dcbz128 c128,rd
        dcbz128 c256,rd
        dcbz128 c384,rd
1:
        addi    rt,rs,64
        dcbt    0,rx                    // touch in next chunk
        dcbt    c128,rx
        dcbt    c256,rx
        dcbt    c384,rx
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        addi    rs,rs,128
        lvxl    v5,0,rt
        cmplwi  rc,512                  // another chunk to go?
        lvxl    v6,c16,rt
        lvxl    v7,c32,rt
        lvxl    v8,c48,rt
        addi    rt,rs,64
        lvxl    v9,0,rs
        lvxl    v10,c16,rs
        lvxl    v11,c32,rs
        lvxl    v12,c48,rs
        addi    rs,rs,128
        lvxl    v13,0,rt
        lvxl    v14,c16,rt
        lvxl    v15,c32,rt
        lvxl    v16,c48,rt
        addi    rt,rs,64
        lvxl    v17,0,rs
        lvxl    v18,c16,rs
        lvxl    v19,c32,rs
        lvxl    v20,c48,rs
        addi    rs,rs,128
        lvxl    v21,0,rt
        lvxl    v22,c16,rt
        lvxl    v23,c32,rt
        lvxl    v24,c48,rt
        addi    rt,rs,64
        lvxl    v25,0,rs
        lvxl    v26,c16,rs
        lvxl    v27,c32,rs
        lvxl    v28,c48,rs
        addi    rs,rs,128
        lvxl    v29,0,rt
        lvxl    v30,c16,rt
        lvxl    v31,c32,rt
        lvxl    v0,c48,rt

        addi    rt,rd,64
        stvxl   v1,0,rd
        stvxl   v2,c16,rd
        stvxl   v3,c32,rd
        stvxl   v4,c48,rd
        addi    rd,rd,128
        stvxl   v5,0,rt
        stvxl   v6,c16,rt
        stvxl   v7,c32,rt
        stvxl   v8,c48,rt
        addi    rt,rd,64
        stvxl   v9,0,rd
        stvxl   v10,c16,rd
        stvxl   v11,c32,rd
        stvxl   v12,c48,rd
        addi    rd,rd,128
        stvxl   v13,0,rt
        stvxl   v14,c16,rt
        stvxl   v15,c32,rt
        stvxl   v16,c48,rt
        addi    rt,rd,64
        stvxl   v17,0,rd
        stvxl   v18,c16,rd
        stvxl   v19,c32,rd
        stvxl   v20,c48,rd
        addi    rd,rd,128
        stvxl   v21,0,rt
        stvxl   v22,c16,rt
        stvxl   v23,c32,rt
        stvxl   v24,c48,rt
        addi    rt,rd,64
        stvxl   v25,0,rd
        stvxl   v26,c16,rd
        stvxl   v27,c32,rd
        stvxl   v28,c48,rd
        addi    rd,rd,128
        stvxl   v29,0,rt
        stvxl   v30,c16,rt
        stvxl   v31,c32,rt
        stvxl   v0,c48,rt
        bge++   LalignedLoop            // loop if another 512 bytes to go

// End of aligned main loop.  Handle up to 511 leftover bytes.

        srwi.   r0,rc,5                 // get count of 32-byte chunks remaining
        beq     Ldone                   // none
        rlwinm  rc,rc,0,0x1F            // mask count down to 0..31 leftover bytes
        mtctr   r0
1:                                      // loop over 32-byte chunks
        lvx     v1,0,rs
        lvx     v2,c16,rs
        addi    rs,rs,32
        stvx    v1,0,rd
        stvx    v2,c16,rd
        addi    rd,rd,32
        bdnz    1b


// Done, except for 0..31 leftovers at end.  Restore non-volatiles.
//    rs = source ptr
//    rd = dest ptr
//    rc = count (0..31)
//    rv = caller's vrsave

Ldone:
        cmpwi   rc,0                    // any leftover bytes?
        lwz     r13,rzR13(r1)           // restore non-volatiles from redzone
        lwz     r14,rzR14(r1)
        lwz     r15,rzR15(r1)
        li      r0,rzV20
        lvx     v20,r1,r0
        li      r0,rzV21
        lvx     v21,r1,r0
        li      r0,rzV22
        lvx     v22,r1,r0
        li      r0,rzV23
        lvx     v23,r1,r0
        li      r0,rzV24
        lvx     v24,r1,r0
        li      r0,rzV25
        lvx     v25,r1,r0
        li      r0,rzV26
        lvx     v26,r1,r0
        li      r0,rzV27
        lvx     v27,r1,r0
        li      r0,rzV28
        lvx     v28,r1,r0
        li      r0,rzV29
        lvx     v29,r1,r0
        li      r0,rzV30
        lvx     v30,r1,r0
        li      r0,rzV31
        lvx     v31,r1,r0
        mtspr   vrsave,rv               // restore caller's bitmask
        beqlr                           // done if no leftover bytes


// Handle 1..31 leftover bytes at end.

        mtctr   rc                      // set up loop count
        b       1f

        .align  5
1:
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        bdnz    1b

        blr


        COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,0)      // load on all machines for now
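
// (The descriptor above appears to register this routine for placement on
// the commpage at _COMM_PAGE_BIGCOPY; the zero masks impose no CPU-capability
// requirements, matching the "load on all machines" note.)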