/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G3.
 *
 * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
 * environment.
 *
 *   r0    = "w7" or temp
 *   r2    = "w8"
 *   r3    = not used, as memcpy and memmove return 1st parameter as a value
 *   r4    = source ptr ("rs")
 *   r5    = count of bytes to move ("rc")
 *   r6    = "w1"
 *   r7    = "w2"
 *   r8    = "w3"
 *   r9    = "w4"
 *   r10   = "w5"
 *   r11   = "w6"
 *   r12   = destination ptr ("rd")
 *   f0-f3 = used for moving 8-byte aligned data
 */
#define rs  r4          // NB: we depend on rs==r4 in "lswx" instructions
#define rd  r12
#define rc  r5

#define w1  r6
#define w2  r7
#define w3  r8
#define w4  r9
#define w5  r10
#define w6  r11
#define w7  r0
#define w8  r2

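// Editorial sketch (not part of the original source): the overall shape of the
// routines below, paraphrased in C for orientation.  The helper names here are
// invented for illustration.
//
//      void *copy(void *dst, const void *src, size_t len) {
//          uint32_t delta = (uint32_t)((uintptr_t)dst - (uintptr_t)src);
//          if (len < 33)                       // kLong: short operands
//              string_op_copy(dst, src, len);  // lswx/stswx, <= 32 bytes
//          else if (delta < len)               // dest overlaps source tail
//              reverse_copy(dst, src, len);    // GPRs, or FPRs if mutually aligned
//          else if ((delta & 3) == 0)          // relatively word aligned
//              forward_copy_fprs(dst, src, len);
//          else
//              forward_copy_gprs(dst, src, len);
//          return dst;                         // memcpy/memmove return dst
//      }
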
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text


#define kLong   33              // too long for string ops

// Main entry points.

        .align  5
bcopy_g3:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kLong                // length > 32 bytes?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // start to move source & dest to canonic spot
        bge     LLong0                  // skip if long operand
        mtxer   rc                      // set length for string ops
        lswx    r5,0,r3                 // load bytes into r5-r12
        stswx   r5,0,r4                 // store them
        blr

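// Editorial note (not in the original): on the short path all bytes are first
// loaded into r5-r12 by lswx and only then stored by stswx, so overlapping
// operands are handled for free.  In effect (C illustration only):
//
//      void short_copy(unsigned char *dst, const unsigned char *src, size_t len) {
//          unsigned char tmp[32];                          // len <= 32 on this path
//          for (size_t i = 0; i < len; i++) tmp[i] = src[i];
//          for (size_t i = 0; i < len; i++) dst[i] = tmp[i];
//      }
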
// NB: memcpy() and memmove() must follow bcopy() by 32 bytes, for comm page.

        .align  5
Lmemcpy_g3:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g3:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kLong                // length > 32 bytes?
        sub     w1,r3,rs                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong1                  // longer than 32 bytes
        mtxer   rc                      // set length for string ops
        lswx    r5,0,r4                 // load bytes into r5-r12
        stswx   r5,0,r3                 // store them
        blr

// Long operands (more than 32 bytes.)
// w1 = (rd-rs), used to check for alignment

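// Editorial note (not in the original): w1 = (rd-rs) is treated as unsigned,
// so the single compare "w1 < rc" below is true exactly when the destination
// begins inside the first rc bytes of the source, which is the only case where
// a forward copy would overwrite source bytes before reading them.  In C:
//
//      if ((uint32_t)((uintptr_t)dst - (uintptr_t)src) < (uint32_t)len)
//          copy_backward();                    // hypothetical helper
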
LLong0:                                 // enter from bcopy()
        mr      rs,r3                   // must leave r3 alone (it is return value for memcpy)
LLong1:                                 // enter from memcpy() and memmove()
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        rlwinm  r0,w1,0,0x3             // are operands relatively word-aligned?
        neg     w2,rd                   // prepare to align destination
        cmpwi   cr5,r0,0                // set cr5 beq if relatively word aligned
        blt     cr1,LLongReverse        // handle reverse move
        andi.   w4,w2,3                 // w4 <- #bytes to word align destination
        beq     cr5,LLongFloat          // relatively aligned so use FPRs
        sub     rc,rc,w4                // adjust count for alignment
        srwi    r0,rc,5                 // get #chunks to xfer (>=1)
        rlwinm  rc,rc,0,0x1F            // mask down to leftover bytes
        mtctr   r0                      // set up loop count
        beq     1f                      // dest already word aligned

// Word align the destination.

        mtxer   w4                      // byte count to xer
        cmpwi   r0,0                    // any chunks to xfer?
        lswx    w1,0,rs                 // move w4 bytes to align dest
        add     rs,rs,w4
        stswx   w1,0,rd
        add     rd,rd,w4
        beq-    2f                      // pathologic case, no chunks to xfer

// Forward, unaligned loop.

1:
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        lwz     w5,16(rs)
        lwz     w6,20(rs)
        lwz     w7,24(rs)
        lwz     w8,28(rs)
        addi    rs,rs,32
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        stw     w5,16(rd)
        stw     w6,20(rd)
        stw     w7,24(rd)
        stw     w8,28(rd)
        addi    rd,rd,32
        bdnz    1b
2:                                      // rc = remaining bytes (0-31)
        mtxer   rc                      // set up count for string ops
        mr      r0,rd                   // move dest ptr out of the way
        lswx    r5,0,rs                 // load xer bytes into r5-r12 (rs==r4)
        stswx   r5,0,r0                 // store them
        blr

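// Editorial sketch (not in the original): the unaligned forward loop above in
// rough C.  Eight word loads fill w1-w8, eight word stores drain them, 32
// bytes per CTR iteration; the 0-31 leftover bytes go through lswx/stswx.
//
//      while (chunks--) {                      // CTR = adjusted len / 32
//          uint32_t w[8];
//          for (int i = 0; i < 8; i++)         // lwz w1..w8
//              memcpy(&w[i], src + 4*i, 4);    // source may be misaligned
//          src += 32;
//          for (int i = 0; i < 8; i++)         // stw w1..w8
//              memcpy(dst + 4*i, &w[i], 4);    // dest was word aligned above
//          dst += 32;
//      }
//      string_op_copy(dst, src, leftover);     // 0-31 trailing bytes
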
// Forward, aligned loop.  We use FPRs.

LLongFloat:
        andi.   w4,w2,7                 // w4 <- #bytes to doubleword-align destination
        sub     rc,rc,w4                // adjust count for alignment
        srwi    r0,rc,5                 // number of 32-byte chunks to xfer
        rlwinm  rc,rc,0,0x1F            // mask down to leftover bytes
        mtctr   r0                      // set up loop count
        beq     1f                      // dest already doubleword aligned

// Doubleword align the destination.

        mtxer   w4                      // byte count to xer
        cmpwi   r0,0                    // any chunks to xfer?
        lswx    w1,0,rs                 // move w4 bytes to align dest
        add     rs,rs,w4
        stswx   w1,0,rd
        add     rd,rd,w4
        beq-    2f                      // pathologic case, no chunks to xfer
1:                                      // loop over 32-byte chunks
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        lfd     f2,16(rs)
        lfd     f3,24(rs)
        addi    rs,rs,32
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        stfd    f2,16(rd)
        stfd    f3,24(rd)
        addi    rd,rd,32
        bdnz    1b
2:                                      // rc = remaining bytes (0-31)
        mtxer   rc                      // set up count for string ops
        mr      r0,rd                   // move dest ptr out of the way
        lswx    r5,0,rs                 // load xer bytes into r5-r12 (rs==r4)
        stswx   r5,0,r0                 // store them
        blr

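// Editorial note (not in the original): when the operands are mutually word
// aligned, the destination is first brought to an 8-byte boundary and the bulk
// is then moved as four 8-byte FPR loads/stores per iteration.  Roughly, and
// assuming for simplicity that both pointers are 8-byte aligned at this point:
//
//      uint64_t *d = (uint64_t *)dst, *s = (uint64_t *)src;
//      while (chunks--) {                      // 32 bytes per iteration
//          uint64_t a = s[0], b = s[1], c = s[2], e = s[3];
//          s += 4;
//          d[0] = a; d[1] = b; d[2] = c; d[3] = e;
//          d += 4;
//      }
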
// Long, reverse moves.
// cr5 = beq if relatively word aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands + 1
        add     rs,rs,rc
        beq     cr5,LReverseFloat       // aligned operands so can use FPRs
        srwi    r0,rc,5                 // get chunk count
        rlwinm  rc,rc,0,0x1F            // mask down to leftover bytes
        mtctr   r0                      // set up loop count
        mtxer   rc                      // set up for trailing bytes
1:
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwz     w4,-16(rs)
        stw     w1,-4(rd)
        lwz     w5,-20(rs)
        stw     w2,-8(rd)
        lwz     w6,-24(rs)
        stw     w3,-12(rd)
        lwz     w7,-28(rs)
        stw     w4,-16(rd)
        lwzu    w8,-32(rs)
        stw     w5,-20(rd)
        stw     w6,-24(rd)
        stw     w7,-28(rd)
        stwu    w8,-32(rd)
        bdnz    1b

        sub     r4,rs,rc                // point to 1st (leftmost) leftover byte (0..31)
        sub     r0,rd,rc                // move dest ptr out of way
        lswx    r5,0,r4                 // load xer bytes into r5-r12
        stswx   r5,0,r0                 // store them
        blr

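// Editorial sketch (not in the original): the reverse copy above starts one
// past the end of both buffers and walks down 32 bytes at a time; the leading
// 0-31 leftover bytes are then moved with lswx/stswx, which is safe even with
// overlap because they are fully loaded into registers before being stored.
//
//      unsigned char *s = src + len, *d = dst + len;   // one past the end
//      while (chunks--) {                              // chunks = len / 32
//          s -= 32; d -= 32;
//          copy_32_bytes(d, s);                        // eight word moves, hypothetical helper
//      }
//      string_op_copy(d - leftover, s - leftover, leftover);   // leftover = len % 32
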
// Long, reverse aligned moves.  We use FPRs.

LReverseFloat:
        andi.   w4,rd,7                 // w4 <- #bytes to doubleword-align destination
        sub     rc,rc,w4                // adjust count for alignment
        srwi    r0,rc,5                 // number of 32-byte chunks to xfer
        rlwinm  rc,rc,0,0x1F            // mask down to leftover bytes
        mtctr   r0                      // set up loop count
        beq     1f                      // dest already doubleword aligned

// Doubleword align the destination.

        mtxer   w4                      // byte count to xer
        cmpwi   r0,0                    // any chunks to xfer?
        sub     rs,rs,w4                // point to 1st bytes to xfer
        sub     rd,rd,w4
        lswx    w1,0,rs                 // move w4 bytes to align dest
        stswx   w1,0,rd
        beq-    2f                      // pathologic case, no chunks to xfer
1:
        lfd     f0,-8(rs)
        lfd     f1,-16(rs)
        lfd     f2,-24(rs)
        lfdu    f3,-32(rs)
        stfd    f0,-8(rd)
        stfd    f1,-16(rd)
        stfd    f2,-24(rd)
        stfdu   f3,-32(rd)
        bdnz    1b
2:                                      // rc = remaining bytes (0-31)
        mtxer   rc                      // set up count for string ops
        sub     r4,rs,rc                // point to 1st (leftmost) leftover byte (0..31)
        sub     r0,rd,rc                // move dest ptr out of way
        lswx    r5,0,r4                 // load xer bytes into r5-r12
        stswx   r5,0,r0                 // store them
        blr

        COMMPAGE_DESCRIPTOR(bcopy_g3,_COMM_PAGE_BCOPY,0,k64Bit+kHasAltivec,kCommPage32)
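// Editorial note (my reading of the macro arguments, not stated in this file):
// the descriptor registers this routine for the 32-bit comm page at
// _COMM_PAGE_BCOPY; the 0 means no CPU features are required, and
// k64Bit+kHasAltivec lists features that must be absent, so this G3 version is
// installed only on 32-bit processors without AltiVec.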