[apple/xnu.git] / osfmk / ppc / commpage / bcopy_g3.s

/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 * 
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G3.
 *
 * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
 * environment.
 *
 *   r0  = "w7" or temp
 *   r2  = "w8"
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1"
 *   r7  = "w2"
 *   r8  = "w3"
 *   r9  = "w4"
 *   r10 = "w5"
 *   r11 = "w6"
 *   r12 = destination ptr ("rd")
 * f0-f3 = used for moving 8-byte aligned data
 */
// Pointer and count aliases.  The short-operand and tail code name r4/r5 directly in
// lswx/stswx, so these three assignments cannot be changed independently of that code.
#define rs	r4		// NB: we depend on rs==r4 in "lswx" instructions
#define rd	r12
#define rc	r5

// Scratch word registers w1..w8 (r6-r11, r0, r2), used in order by the 32-byte word loops.
// w1 is also the first register filled by the alignment lswx/stswx pairs, and w2/w4 carry
// the negated destination and the alignment byte count in the long-operand setup code.
#define w1	r6
#define w2	r7
#define w3	r8
#define	w4	r9
#define	w5	r10
#define	w6	r11
#define	w7	r0
#define	w8	r2

#define	ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text


// Operands of kLong bytes or more branch to LLong0/LLong1; shorter ones (0..32 bytes)
// are moved by a single lswx/stswx pair through r5-r12.
#define	kLong	33					// too long for string ops


// Main entry points.

        .align 	5
// bcopy entry point.
//   In:   r3 = source, r4 = destination, r5 (rc) = byte count
//   Short operands (< kLong bytes) are loaded in full into r5-r12 and then stored, so the
//   move is correct even if the operands overlap.  Longer operands go to LLong0 with
//   w1 = (dest - source) and rd = dest already set up.
bcopy_g3:							// void bcopy(const void *src, void *dst, size_t len)
        cmplwi	rc,kLong			// length > 32 bytes?
        sub		w1,r4,r3			// w1 = dst - src; must move in reverse if (rd-rs)<rc
        mr		rd,r4				// start to move source & dest to canonic spot
        bge		LLong0				// skip if long operand (length >= kLong)
        mtxer	rc					// set length for string ops
        lswx	r5,0,r3				// load bytes into r5-r12
        stswx	r5,0,r4				// store them
        blr

// NB: memcpy() and memmove() must follow bcopy() by 32 bytes, for comm page.

        .align	5
// memcpy and memmove share this entry point.
//   In:   r3 = destination, r4 (rs) = source, r5 (rc) = byte count
//   Out:  r3 = destination; no instruction in this file writes r3, so it is still intact
//         at every blr.
//   Short operands (< kLong bytes) are loaded in full into r5-r12 and then stored.
//   Longer operands go to LLong1 with w1 = (dest - source) and rd = dest.
Lmemcpy_g3:							// void* memcpy(void *dst, void *src, size_t len)
Lmemmove_g3:						// void* memmove(void *dst, const void *src, size_t len)
        cmplwi	rc,kLong			// length > 32 bytes?
        sub		w1,r3,rs			// w1 = dst - src; must move in reverse if (rd-rs)<rc
        mr		rd,r3				// must leave r3 alone, it is return value for memcpy etc
        bge		LLong1				// longer than 32 bytes
        mtxer	rc					// set length for string ops
        lswx	r5,0,r4				// load bytes into r5-r12
        stswx	r5,0,r3				// store them
        blr

// Long operands (more than 32 bytes.)
//		w1  = (rd-rs), used to check for alignment

// Long-operand dispatch, followed by the forward path for operands that are not
// relatively word aligned.
//   At LLong1:  rs (r4) = source, rd (r12) = destination, rc (r5) = length (>= kLong),
//               w1 = rd - rs.
//   cr1 selects reverse vs. forward; cr5 selects the FPR loops (relatively word aligned)
//   vs. the GPR word loops.  cr0 is set by the andi. and consumed by the "beq 1f" below,
//   so nothing in between may alter cr0.

LLong0:								// enter from bcopy()
        mr		rs,r3				// bcopy passes the source in r3; move it to rs (r4)
LLong1:								// enter from memcpy() and memmove()
        cmplw	cr1,w1,rc			// set cr1 blt iff we must move reverse
        rlwinm	r0,w1,0,0x3			// are operands relatively word-aligned?
        neg		w2,rd				// prepare to align destination
        cmpwi	cr5,r0,0			// set cr5 beq if relatively word aligned
        blt		cr1,LLongReverse	// handle reverse move
        andi.	w4,w2,3				// w4 <- #bytes to word align destination
        beq		cr5,LLongFloat		// relatively aligned so use FPRs
        sub		rc,rc,w4			// adjust count for alignment
        srwi	r0,rc,5				// get #32-byte chunks to xfer (0 is possible only if w4 != 0)
        rlwinm	rc,rc,0,0x1F		// mask down to leftover bytes
        mtctr	r0					// set up loop count
        beq		1f					// dest already word aligned (cr0 from the andi. above)
    
// Word align the destination.
        
        mtxer	w4					// byte count to xer
        cmpwi	r0,0				// any chunks to xfer?
        lswx	w1,0,rs				// move w4 bytes to align dest
        add		rs,rs,w4
        stswx	w1,0,rd
        add		rd,rd,w4
        beq-	2f					// pathologic case, no chunks to xfer

// Forward, unaligned loop.  Each pass loads 32 bytes into w1-w8, then stores them.

1:
        lwz		w1,0(rs)
        lwz		w2,4(rs)
        lwz		w3,8(rs)
        lwz		w4,12(rs)
        lwz		w5,16(rs)
        lwz		w6,20(rs)
        lwz		w7,24(rs)
        lwz		w8,28(rs)
        addi	rs,rs,32
        stw		w1,0(rd)
        stw		w2,4(rd)
        stw		w3,8(rd)
        stw		w4,12(rd)
        stw		w5,16(rd)
        stw		w6,20(rd)
        stw		w7,24(rd)
        stw		w8,28(rd)
        addi	rd,rd,32
        bdnz	1b
2:									// rc = remaining bytes (0-31)
        mtxer	rc					// set up count for string ops
        mr		r0,rd				// move dest ptr out of the way (lswx below may overwrite r12)
        lswx	r5,0,rs				// load xer bytes into r5-r12 (rs==r4)
        stswx	r5,0,r0				// store them
        blr
        

// Forward, aligned loop.  We use FPRs.

// Forward move of relatively word-aligned operands, 32 bytes per pass through f0-f3.
//   Entered from LLong1 with w2 = -rd, rc = length (>= kLong), rs/rd = start of operands.
//   The destination is first brought to a doubleword boundary with a 1..7 byte string
//   move; that can leave fewer than 32 bytes, in which case the loop is skipped.

LLongFloat:
        andi.	w4,w2,7				// w4 <- #bytes to doubleword-align destination
        sub		rc,rc,w4			// adjust count for alignment
        srwi	r0,rc,5				// number of 32-byte chunks to xfer
        rlwinm	rc,rc,0,0x1F		// mask down to leftover bytes
        mtctr	r0					// set up loop count
        beq		1f					// dest already doubleword aligned (cr0 from the andi. above)
    
// Doubleword align the destination.
        
        mtxer	w4					// byte count to xer
        cmpwi	r0,0				// any chunks to xfer?
        lswx	w1,0,rs				// move w4 bytes to align dest
        add		rs,rs,w4
        stswx	w1,0,rd
        add		rd,rd,w4
        beq-	2f					// pathologic case, no chunks to xfer
1:									// loop over 32-byte chunks
        lfd		f0,0(rs)
        lfd		f1,8(rs)
        lfd		f2,16(rs)
        lfd		f3,24(rs)
        addi	rs,rs,32
        stfd	f0,0(rd)
        stfd	f1,8(rd)
        stfd	f2,16(rd)
        stfd	f3,24(rd)
        addi	rd,rd,32
        bdnz	1b
2:									// rc = remaining bytes (0-31)
        mtxer	rc					// set up count for string ops
        mr		r0,rd				// move dest ptr out of the way (lswx below may overwrite r12)
        lswx	r5,0,rs				// load xer bytes into r5-r12 (rs==r4)
        stswx	r5,0,r0				// store them
        blr

        
// Long, reverse moves.
//		cr5 = beq if relatively word aligned

// Reverse (high-to-low address) move, taken when (rd - rs) < rc unsigned.
//   Entered from LLong1 with rs/rd = start of operands and rc = length (>= kLong, so the
//   chunk count below is at least 1).  Operands that are not relatively word aligned are
//   moved here with GPRs; no destination alignment step is performed on this path.

LLongReverse:
        add		rd,rd,rc			// point to end of operands + 1
        add		rs,rs,rc
        beq		cr5,LReverseFloat	// aligned operands so can use FPRs
        srwi	r0,rc,5				// get chunk count
        rlwinm	rc,rc,0,0x1F		// mask down to leftover bytes
        mtctr	r0					// set up loop count
        mtxer	rc					// set up for trailing bytes (XER is not written again before the lswx)
1:
        lwz		w1,-4(rs)
        lwz		w2,-8(rs)
        lwz		w3,-12(rs)
        lwz		w4,-16(rs)
        stw		w1,-4(rd)
        lwz		w5,-20(rs)
        stw		w2,-8(rd)
        lwz		w6,-24(rs)
        stw		w3,-12(rd)
        lwz		w7,-28(rs)
        stw		w4,-16(rd)
        lwzu	w8,-32(rs)			// last load of the chunk also steps rs down by 32
        stw		w5,-20(rd)
        stw		w6,-24(rd)
        stw		w7,-28(rd)
        stwu	w8,-32(rd)			// last store of the chunk also steps rd down by 32
        bdnz	1b

        sub		r4,rs,rc			// point to 1st (leftmost) leftover byte (0..31)
        sub		r0,rd,rc			// move dest ptr out of way
        lswx	r5,0,r4				// load xer bytes into r5-r12
        stswx	r5,0,r0				// store them
        blr


// Long, reverse aligned moves.  We use FPRs.

// Reverse move of relatively word-aligned operands, 32 bytes per pass through f0-f3.
//   Entered from LLongReverse with rs/rd = end of operands + 1 and rc = length (>= kLong).
//   The low three bits of rd give the number of bytes lying above the last doubleword
//   boundary of the destination; those are moved first with a string move.

LReverseFloat:
        andi.	w4,rd,7				// w4 <- #bytes to doubleword-align destination
        sub		rc,rc,w4			// adjust count for alignment
        srwi	r0,rc,5				// number of 32-byte chunks to xfer
        rlwinm	rc,rc,0,0x1F		// mask down to leftover bytes
        mtctr	r0					// set up loop count
        beq		1f					// dest already doubleword aligned (cr0 from the andi. above)
    
// Doubleword align the destination.
        
        mtxer	w4					// byte count to xer
        cmpwi	r0,0				// any chunks to xfer?
        sub		rs,rs,w4			// point to 1st bytes to xfer
        sub		rd,rd,w4
        lswx	w1,0,rs				// move w4 bytes to align dest
        stswx	w1,0,rd
        beq-	2f					// pathologic case, no chunks to xfer
1:
        lfd		f0,-8(rs)
        lfd		f1,-16(rs)
        lfd		f2,-24(rs)
        lfdu	f3,-32(rs)			// last load of the chunk also steps rs down by 32
        stfd	f0,-8(rd)
        stfd	f1,-16(rd)
        stfd	f2,-24(rd)
        stfdu	f3,-32(rd)			// last store of the chunk also steps rd down by 32
        bdnz	1b
2:									// rc = remaining bytes (0-31)
        mtxer	rc					// set up count for string ops
        sub		r4,rs,rc			// point to 1st (leftmost) leftover byte (0..31)
        sub		r0,rd,rc			// move dest ptr out of way
        lswx	r5,0,r4				// load xer bytes into r5-r12
        stswx	r5,0,r0				// store them
        blr

	// NOTE(review): COMMPAGE_DESCRIPTOR is defined outside this file.  The arguments appear to
	// bind bcopy_g3 to the _COMM_PAGE_BCOPY slot and to give CPU-capability constraints
	// (0 and k64Bit+kHasAltivec) plus the target comm page (kCommPage32) -- confirm the exact
	// meaning of each argument in <machine/commpage.h>.
	COMMPAGE_DESCRIPTOR(bcopy_g3,_COMM_PAGE_BCOPY,0,k64Bit+kHasAltivec,kCommPage32)
Commit	Line	Data
55e303ae A	1	/*
	2	* Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
	3	*
6601e61a	4	* @APPLE_LICENSE_HEADER_START@
55e303ae	5	*
6601e61a A	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
8f6c56a5	11	*
6601e61a A	12	* This Original Code and all software distributed under the License are
6601e61a A	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5 A	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
8f6c56a5 A	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
6601e61a A	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
8f6c56a5	19	*
6601e61a	20	* @APPLE_LICENSE_HEADER_END@
55e303ae A	21	*/
	22	/* =======================================
	23	* BCOPY, MEMCPY, and MEMMOVE for Mac OS X
	24	* =======================================
	25	*
	26	* Version of 2/20/2003, tuned for G3.
	27	*
	28	* Register usage. Note we use R2, so this code will not run in a PEF/CFM
	29	* environment.
	30	*
	31	* r0 = "w7" or temp
	32	* r2 = "w8"
	33	* r3 = not used, as memcpy and memmove return 1st parameter as a value
	34	* r4 = source ptr ("rs")
	35	* r5 = count of bytes to move ("rc")
	36	* r6 = "w1"
	37	* r7 = "w2"
	38	* r8 = "w3"
	39	* r9 = "w4"
	40	* r10 = "w5"
	41	* r11 = "w6"
	42	* r12 = destination ptr ("rd")
	43	* f0-f3 = used for moving 8-byte aligned data
	44	*/
	45	#define rs r4 // NB: we depend on rs==r4 in "lswx" instructions
	46	#define rd r12
	47	#define rc r5
	48
	49	#define w1 r6
	50	#define w2 r7
	51	#define w3 r8
	52	#define w4 r9
	53	#define w5 r10
	54	#define w6 r11
	55	#define w7 r0
	56	#define w8 r2
	57
	58	#define ASSEMBLER
	59	#include <sys/appleapiopts.h>
	60	#include <ppc/asm.h>
	61	#include <machine/cpu_capabilities.h>
	62	#include <machine/commpage.h>
	63
	64	.text
55e303ae A	65
	66
	67	#define kLong 33 // too long for string ops
	68
	69
	70	// Main entry points.
	71
	72	.align 5
	73	bcopy_g3: // void bcopy(const void src, void dst, size_t len)
	74	cmplwi rc,kLong // length > 32 bytes?
	75	sub w1,r4,r3 // must move in reverse if (rd-rs)<rc
	76	mr rd,r4 // start to move source & dest to canonic spot
	77	bge LLong0 // skip if long operand
	78	mtxer rc // set length for string ops
	79	lswx r5,0,r3 // load bytes into r5-r12
	80	stswx r5,0,r4 // store them
	81	blr
	82
	83	// NB: memcpy() and memmove() must follow bcopy() by 32 bytes, for comm page.
	84
	85	.align 5
	86	Lmemcpy_g3: // void* memcpy(void dst, void src, size_t len)
	87	Lmemmove_g3: // void* memmove(void dst, const void src, size_t len)
	88	cmplwi rc,kLong // length > 32 bytes?
	89	sub w1,r3,rs // must move in reverse if (rd-rs)<rc
	90	mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
	91	bge LLong1 // longer than 32 bytes
	92	mtxer rc // set length for string ops
	93	lswx r5,0,r4 // load bytes into r5-r12
	94	stswx r5,0,r3 // store them
	95	blr
	96
	97	// Long operands (more than 32 bytes.)
	98	// w1 = (rd-rs), used to check for alignment
	99
	100	LLong0: // enter from bcopy()
	101	mr rs,r3 // must leave r3 alone (it is return value for memcpy)
	102	LLong1: // enter from memcpy() and memmove()
	103	cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
	104	rlwinm r0,w1,0,0x3 // are operands relatively word-aligned?
	105	neg w2,rd // prepare to align destination
	106	cmpwi cr5,r0,0 // set cr5 beq if relatively word aligned
	107	blt cr1,LLongReverse // handle reverse move
	108	andi. w4,w2,3 // w4 <- #bytes to word align destination
	109	beq cr5,LLongFloat // relatively aligned so use FPRs
	110	sub rc,rc,w4 // adjust count for alignment
	111	srwi r0,rc,5 // get #chunks to xfer (>=1)
	112	rlwinm rc,rc,0,0x1F // mask down to leftover bytes
	113	mtctr r0 // set up loop count
	114	beq 1f // dest already word aligned
	115
	116	// Word align the destination.
	117
	118	mtxer w4 // byte count to xer
	119	cmpwi r0,0 // any chunks to xfer?
	120	lswx w1,0,rs // move w4 bytes to align dest
	121	add rs,rs,w4
	122	stswx w1,0,rd
	123	add rd,rd,w4
	124	beq- 2f // pathologic case, no chunks to xfer
	125
	126	// Forward, unaligned loop.
	127
	128	1:
129	lwz w1,0(rs)
130	lwz w2,4(rs)
131	lwz w3,8(rs)
132	lwz w4,12(rs)
133	lwz w5,16(rs)
134	lwz w6,20(rs)
135	lwz w7,24(rs)
136	lwz w8,28(rs)
137	addi rs,rs,32
138	stw w1,0(rd)
139	stw w2,4(rd)
140	stw w3,8(rd)
141	stw w4,12(rd)
142	stw w5,16(rd)
143	stw w6,20(rd)
144	stw w7,24(rd)
145	stw w8,28(rd)
146	addi rd,rd,32
147	bdnz 1b
148	2: // rc = remaining bytes (0-31)
149	mtxer rc // set up count for string ops
150	mr r0,rd // move dest ptr out of the way
151	lswx r5,0,rs // load xer bytes into r5-r12 (rs==r4)
152	stswx r5,0,r0 // store them
153	blr
154
155
156
157	// Forward, aligned loop. We use FPRs.
158
159	LLongFloat:
160	andi. w4,w2,7 // W4 <- #bytes to doubleword-align destination
161	sub rc,rc,w4 // adjust count for alignment
162	srwi r0,rc,5 // number of 32-byte chunks to xfer
163	rlwinm rc,rc,0,0x1F // mask down to leftover bytes
164	mtctr r0 // set up loop count
165	beq 1f // dest already doubleword aligned
166
167	// Doubleword align the destination.
168
169	mtxer w4 // byte count to xer
170	cmpwi r0,0 // any chunks to xfer?
171	lswx w1,0,rs // move w4 bytes to align dest
172	add rs,rs,w4
173	stswx w1,0,rd
174	add rd,rd,w4
175	beq- 2f // pathologic case, no chunks to xfer
176	1: // loop over 32-byte chunks
177	lfd f0,0(rs)
178	lfd f1,8(rs)
179	lfd f2,16(rs)
180	lfd f3,24(rs)
181	addi rs,rs,32
182	stfd f0,0(rd)
183	stfd f1,8(rd)
184	stfd f2,16(rd)
185	stfd f3,24(rd)
186	addi rd,rd,32
187	bdnz 1b
188	2: // rc = remaining bytes (0-31)
189	mtxer rc // set up count for string ops
190	mr r0,rd // move dest ptr out of the way
191	lswx r5,0,rs // load xer bytes into r5-r12 (rs==r4)
192	stswx r5,0,r0 // store them
193	blr
194
195
196	// Long, reverse moves.
197	// cr5 = beq if relatively word aligned
198
199	LLongReverse:
200	add rd,rd,rc // point to end of operands + 1
201	add rs,rs,rc
202	beq cr5,LReverseFloat // aligned operands so can use FPRs
203	srwi r0,rc,5 // get chunk count
204	rlwinm rc,rc,0,0x1F // mask down to leftover bytes
205	mtctr r0 // set up loop count
206	mtxer rc // set up for trailing bytes
207	1:
208	lwz w1,-4(rs)
209	lwz w2,-8(rs)
210	lwz w3,-12(rs)
211	lwz w4,-16(rs)
212	stw w1,-4(rd)
213	lwz w5,-20(rs)
214	stw w2,-8(rd)
215	lwz w6,-24(rs)
216	stw w3,-12(rd)
217	lwz w7,-28(rs)
218	stw w4,-16(rd)
219	lwzu w8,-32(rs)
220	stw w5,-20(rd)
221	stw w6,-24(rd)
222	stw w7,-28(rd)
223	stwu w8,-32(rd)
224	bdnz 1b
225
226	sub r4,rs,rc // point to 1st (leftmost) leftover byte (0..31)
227	sub r0,rd,rc // move dest ptr out of way
228	lswx r5,0,r4 // load xer bytes into r5-r12
229	stswx r5,0,r0 // store them
230	blr
231
232
233	// Long, reverse aligned moves. We use FPRs.
234
235	LReverseFloat:
236	andi. w4,rd,7 // W3 <- #bytes to doubleword-align destination
237	sub rc,rc,w4 // adjust count for alignment
238	srwi r0,rc,5 // number of 32-byte chunks to xfer
239	rlwinm rc,rc,0,0x1F // mask down to leftover bytes
240	mtctr r0 // set up loop count
241	beq 1f // dest already doubleword aligned
242
243	// Doubleword align the destination.
244
245	mtxer w4 // byte count to xer
246	cmpwi r0,0 // any chunks to xfer?
247	sub rs,rs,w4 // point to 1st bytes to xfer
248	sub rd,rd,w4
249	lswx w1,0,rs // move w3 bytes to align dest
250	stswx w1,0,rd
251	beq- 2f // pathologic case, no chunks to xfer
252	1:
253	lfd f0,-8(rs)
254	lfd f1,-16(rs)
255	lfd f2,-24(rs)
256	lfdu f3,-32(rs)
257	stfd f0,-8(rd)
258	stfd f1,-16(rd)
259	stfd f2,-24(rd)
260	stfdu f3,-32(rd)
261	bdnz 1b
262	2: // rc = remaining bytes (0-31)
263	mtxer rc // set up count for string ops
264	sub r4,rs,rc // point to 1st (leftmost) leftover byte (0..31)
265	sub r0,rd,rc // move dest ptr out of way
266	lswx r5,0,r4 // load xer bytes into r5-r12
267	stswx r5,0,r0 // store them
268	blr
269
91447636	270	COMMPAGE_DESCRIPTOR(bcopy_g3,_COMM_PAGE_BCOPY,0,k64Bit+kHasAltivec,kCommPage32)