[apple/xnu.git] / osfmk / ppc / commpage / bcopy_g3.s

/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G3.
 *
 * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
 * environment.
 *
 *   r0  = "w7" or temp
 *   r2  = "w8"
 *   r3  = 1st parameter; read but never modified, as memcpy and memmove return it as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1"
 *   r7  = "w2"
 *   r8  = "w3"
 *   r9  = "w4"
 *   r10 = "w5"
 *   r11 = "w6"
 *   r12 = destination ptr ("rd")
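 * f0-f3 = used for moving 8-byte aligned data
 * ctr   = count of 32-byte chunks in the copy loops
 * xer   = byte count for the lswx/stswx string ops
 * cr0, cr1, cr5 = branch conditions (cr1 = reverse move needed, cr5 = relatively word aligned)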
 */
// Symbolic names for the registers used throughout this file.
// rs/rd/rc are the canonical source pointer, destination pointer and byte
// count.  The string-op copies that load into r5-r12 overwrite rc (r5),
// w1-w6 (r6-r11) and rd (r12), which is why the tail code first copies the
// destination pointer into r0.
#define rs	r4		// NB: we depend on rs==r4 in "lswx" instructions
#define rd	r12
#define rc	r5

// Work registers w1-w8: hold the eight words of one 32-byte chunk in the
// word-at-a-time loops, and serve as scratch elsewhere.
#define w1	r6
#define w2	r7
#define w3	r8
#define	w4	r9
#define	w5	r10
#define	w6	r11
#define	w7	r0
#define	w8	r2

#define	ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl 	EXT(bcopy_g3)


// Lengths of kLong or more are routed to the LLong paths.  Anything shorter
// (0-32 bytes) fits in the eight registers r5-r12 moved by a single
// lswx/stswx pair.
#define	kLong	33					// too long for string ops


// Main entry points.

// bcopy_g3
//		in:  r3 = source, r4 = destination, r5 (rc) = length in bytes
//		Short operands are loaded completely into r5-r12 and then stored, so
//		the load finishes before any byte is written.  Long operands
//		continue at LLong0 with w1 = (dest - source) and rd = dest already
//		set up; r3 still holds the source there.
//		This entry is exactly eight instructions (32 bytes); see the NB on
//		the memcpy/memmove entry that follows.

        .align 	5
bcopy_g3:							// void bcopy(const void *src, void *dst, size_t len)
        cmplwi	rc,kLong			// length > 32 bytes?
        sub		w1,r4,r3			// must move in reverse if (rd-rs)<rc
        mr		rd,r4				// start to move source & dest to canonic spot
        bge		LLong0				// skip if long operand
        mtxer	rc					// set length for string ops
        lswx	r5,0,r3				// load bytes into r5-r12
        stswx	r5,0,r4				// store them
        blr

// NB: memcpy() and memmove() must follow bcopy() by 32 bytes, for comm page.
//
// Lmemcpy_g3 / Lmemmove_g3 (one shared entry)
//		in:  r3 = destination, r4 (rs) = source, r5 (rc) = length in bytes
//		out: r3 unchanged (it is the return value)
//		Short operands (0-32 bytes) are loaded completely into r5-r12 and
//		then stored.  Long operands continue at LLong1 with
//		w1 = (dest - source) and rd = dest.

        .align	5
Lmemcpy_g3:							// void* memcpy(void *dst, void *src, size_t len)
Lmemmove_g3:						// void* memmove(void *dst, const void *src, size_t len)
        cmplwi	rc,kLong			// length > 32 bytes?
        sub		w1,r3,rs			// must move in reverse if (rd-rs)<rc
        mr		rd,r3				// must leave r3 alone, it is return value for memcpy etc
        bge		LLong1				// longer than 32 bytes
        mtxer	rc					// set length for string ops
        lswx	r5,0,r4				// load bytes into r5-r12
        stswx	r5,0,r3				// store them
        blr

// Long operands (more than 32 bytes.)
//		w1  = (rd-rs), used to check for overlap direction and relative alignment
//		rd  = destination, rc = byte count
//
// Dispatch:
//		(rd-rs) < rc, unsigned   -> LLongReverse (cr5 is passed along to it)
//		relatively word aligned  -> LLongFloat   (w2 = -rd is passed along to it)
//		otherwise                -> the forward word-at-a-time loop below

LLong0:								// enter from bcopy()
        mr		rs,r3				// bcopy passes source in r3; move it to rs (dest is already in rd)
LLong1:								// enter from memcpy() and memmove()
        cmplw	cr1,w1,rc			// set cr1 blt iff we must move reverse
        rlwinm	r0,w1,0,0x3			// are operands relatively word-aligned?
        neg		w2,rd				// prepare to align destination
        cmpwi	cr5,r0,0			// set cr5 beq if relatively word aligned
        blt		cr1,LLongReverse	// handle reverse move
        andi.	w4,w2,3				// w4 <- #bytes to word align destination
        beq		cr5,LLongFloat		// relatively aligned so use FPRs
        sub		rc,rc,w4			// adjust count for alignment
        srwi	r0,rc,5				// get #chunks to xfer (can be 0 once w4 is subtracted; see beq- 2f below)
        rlwinm	rc,rc,0,0x1F		// mask down to leftover bytes
        mtctr	r0					// set up loop count
        beq		1f					// dest already word aligned (cr0 still from andi. above)
    
// Word align the destination.
        
        mtxer	w4					// byte count to xer
        cmpwi	r0,0				// any chunks to xfer?
        lswx	w1,0,rs				// move w4 bytes to align dest
        add		rs,rs,w4
        stswx	w1,0,rd
        add		rd,rd,w4
        beq-	2f					// pathologic case, no chunks to xfer

// Forward, unaligned loop.
// Each pass loads eight words into w1-w8 (r6-r11, r0, r2) and then stores
// them, so a full 32-byte chunk is read before any of it is written.

1:
        lwz		w1,0(rs)
        lwz		w2,4(rs)
        lwz		w3,8(rs)
        lwz		w4,12(rs)
        lwz		w5,16(rs)
        lwz		w6,20(rs)
        lwz		w7,24(rs)
        lwz		w8,28(rs)
        addi	rs,rs,32
        stw		w1,0(rd)
        stw		w2,4(rd)
        stw		w3,8(rd)
        stw		w4,12(rd)
        stw		w5,16(rd)
        stw		w6,20(rd)
        stw		w7,24(rd)
        stw		w8,28(rd)
        addi	rd,rd,32
        bdnz	1b
2:									// rc = remaining bytes (0-31)
        mtxer	rc					// set up count for string ops
        mr		r0,rd				// move dest ptr out of the way (the lswx below may overwrite r12)
        lswx	r5,0,rs				// load xer bytes into r5-r12 (rs==r4)
        stswx	r5,0,r0				// store them
        blr
        

// Forward copy of relatively word-aligned operands, 32 bytes per pass
// through f0-f3.
//		in:  rs/rd = source/dest, rc = byte count, w2 = -rd
//		Steps: peel 0-7 bytes so the destination is doubleword aligned,
//		run the FPR loop over whole 32-byte chunks, then finish the 0-31
//		leftover bytes with one lswx/stswx pair.

LLongFloat:
        andi.	w4,w2,7				// w4 = bytes up to next doubleword boundary of dest; cr0 beq if none
        subf	rc,w4,rc			// take the peeled bytes out of the count
        srwi	r0,rc,5				// r0 = number of whole 32-byte chunks
        mtctr	r0					// ctr drives the chunk loop
        clrlwi	rc,rc,27			// keep low 5 bits: bytes left after the chunks
        beq		LFloatLoop			// cr0 is still from andi.: nothing to peel

// Peel w4 bytes to doubleword align the destination.

        mtxer	w4					// string ops will move w4 bytes
        lswx	w1,0,rs				// fetch them, starting in w1
        stswx	w1,0,rd				// and write them to dest
        cmpwi	r0,0				// cr0 beq if there is no whole chunk
        add		rs,rs,w4			// step both pointers past the peeled bytes
        add		rd,rd,w4
        beq-	LFloatTail			// rare: skip the loop, ctr is zero
LFloatLoop:							// one 32-byte chunk per pass
        lfd		f0,0(rs)			// read the whole chunk ...
        lfd		f1,8(rs)
        lfd		f2,16(rs)
        lfd		f3,24(rs)
        addi	rs,rs,32
        stfd	f0,0(rd)			// ... before writing any of it
        stfd	f1,8(rd)
        stfd	f2,16(rd)
        stfd	f3,24(rd)
        addi	rd,rd,32
        bdnz	LFloatLoop
LFloatTail:							// rc = 0-31 bytes still to move
        mr		r0,rd				// keep dest in r0; the lswx below may overwrite r12
        mtxer	rc					// length for the string ops
        lswx	r5,0,rs				// leftover bytes into r5-r12 (rs==r4)
        stswx	r5,0,r0				// and out to dest
        blr

        
// Reverse (high-to-low address) copy, used when (rd-rs) < rc.
//		in:  rs/rd = start of source/dest, rc = byte count (at least kLong),
//		     cr5 = beq if operands are relatively word aligned
//		Relatively aligned operands are handed to LReverseFloat; the rest
//		are moved a word at a time here, then the 0-31 lowest bytes are
//		moved with one lswx/stswx pair.

LLongReverse:
        add		rs,rs,rc			// point both pointers one byte past
        add		rd,rd,rc			//   the end of their operand
        beq		cr5,LReverseFloat	// relatively aligned: use the FPR variant
        srwi	r0,rc,5				// r0 = number of whole 32-byte chunks (>=1 here)
        mtctr	r0					// ctr drives the chunk loop
        clrlwi	rc,rc,27			// keep low 5 bits: bytes left below the chunks
        mtxer	rc					// preload string-op length for the tail
LReverseLoop:						// 32 bytes per pass, highest word first
        lwz		w1,-4(rs)
        lwz		w2,-8(rs)
        lwz		w3,-12(rs)
        lwz		w4,-16(rs)
        stw		w1,-4(rd)			// stores trail the loads by four words
        lwz		w5,-20(rs)
        stw		w2,-8(rd)
        lwz		w6,-24(rs)
        stw		w3,-12(rd)
        lwz		w7,-28(rs)
        stw		w4,-16(rd)
        lwzu	w8,-32(rs)			// last load also steps rs down by 32
        stw		w5,-20(rd)
        stw		w6,-24(rd)
        stw		w7,-28(rd)
        stwu	w8,-32(rd)			// last store also steps rd down by 32
        bdnz	LReverseLoop

        subf	r0,rc,rd			// r0 = dest of lowest leftover byte; the lswx below may overwrite r12
        subf	rs,rc,rs			// r4 = source of lowest leftover byte (0..31 remain)
        lswx	r5,0,r4				// leftover bytes into r5-r12
        stswx	r5,0,r0				// and out to dest
        blr


// Reverse (high-to-low address) copy of relatively word-aligned operands,
// 32 bytes per pass through f0-f3.
//		in:  rs/rd = one byte past the end of source/dest, rc = byte count
//		Steps: peel the 0-7 bytes above the last doubleword boundary of the
//		destination, run the FPR loop downward, then finish the 0-31
//		lowest bytes with one lswx/stswx pair.

LReverseFloat:
        andi.	w4,rd,7				// w4 = bytes above dest's last doubleword boundary; cr0 beq if none
        subf	rc,w4,rc			// take the peeled bytes out of the count
        srwi	r0,rc,5				// r0 = number of whole 32-byte chunks
        mtctr	r0					// ctr drives the chunk loop
        clrlwi	rc,rc,27			// keep low 5 bits: bytes left below the chunks
        beq		LReverseFloatLoop	// cr0 is still from andi.: nothing to peel

// Peel w4 bytes so the end of the destination is doubleword aligned.

        mtxer	w4					// string ops will move w4 bytes
        subf	rd,w4,rd			// back both pointers up to the first
        subf	rs,w4,rs			//   of the peeled bytes
        cmpwi	r0,0				// cr0 beq if there is no whole chunk
        lswx	w1,0,rs				// fetch the w4 bytes, starting in w1
        stswx	w1,0,rd				// and write them to dest
        beq-	LReverseFloatTail	// rare: skip the loop, ctr is zero
LReverseFloatLoop:					// one 32-byte chunk per pass, highest doubleword first
        lfd		f0,-8(rs)			// read the whole chunk ...
        lfd		f1,-16(rs)
        lfd		f2,-24(rs)
        lfdu	f3,-32(rs)			// last load also steps rs down by 32
        stfd	f0,-8(rd)			// ... before writing any of it
        stfd	f1,-16(rd)
        stfd	f2,-24(rd)
        stfdu	f3,-32(rd)			// last store also steps rd down by 32
        bdnz	LReverseFloatLoop
LReverseFloatTail:					// rc = 0-31 bytes still to move
        subf	r0,rc,rd			// r0 = dest of lowest leftover byte; the lswx below may overwrite r12
        mtxer	rc					// length for the string ops
        subf	rs,rc,rs			// r4 = source of lowest leftover byte
        lswx	r5,0,r4				// leftover bytes into r5-r12
        stswx	r5,0,r0				// and out to dest
        blr

        // Registers bcopy_g3 for the comm page slot _COMM_PAGE_BCOPY.
        // NOTE(review): the macro is not defined in this file (presumably
        // <machine/commpage.h>).  The remaining arguments look like
        // "must have" / "can't have" capability masks and a special-handling
        // flag, which would exclude this routine on CPUs reporting k64Bit or
        // kHasAltivec -- confirm against the macro definition.
        COMMPAGE_DESCRIPTOR(bcopy_g3,_COMM_PAGE_BCOPY,0,k64Bit+kHasAltivec,0)
Commit	Line	Data
d7e50217 A	1	/*
	2	* Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
	7	*
	8	* This file contains Original Code and/or Modifications of Original Code
	9	* as defined in and that are subject to the Apple Public Source License
	10	* Version 2.0 (the 'License'). You may not use this file except in
	11	* compliance with the License. Please obtain a copy of the License at
	12	* http://www.opensource.apple.com/apsl/ and read it before using this
	13	* file.
	14	*
	15	* The Original Code and all software distributed under the License are
	16	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	17	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	18	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	19	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	20	* Please see the License for the specific language governing rights and
	21	* limitations under the License.
	22	*
	23	* @APPLE_LICENSE_HEADER_END@
	24	*/
	25	/* =======================================
	26	* BCOPY, MEMCPY, and MEMMOVE for Mac OS X
	27	* =======================================
	28	*
	29	* Version of 2/20/2003, tuned for G3.
	30	*
	31	* Register usage. Note we use R2, so this code will not run in a PEF/CFM
	32	* environment.
	33	*
	34	* r0 = "w7" or temp
	35	* r2 = "w8"
	36	* r3 = not used, as memcpy and memmove return 1st parameter as a value
	37	* r4 = source ptr ("rs")
	38	* r5 = count of bytes to move ("rc")
	39	* r6 = "w1"
	40	* r7 = "w2"
	41	* r8 = "w3"
	42	* r9 = "w4"
	43	* r10 = "w5"
	44	* r11 = "w6"
	45	* r12 = destination ptr ("rd")
	46	* f0-f3 = used for moving 8-byte aligned data
	47	*/
	48	#define rs r4 // NB: we depend on rs==r4 in "lswx" instructions
	49	#define rd r12
	50	#define rc r5
	51
	52	#define w1 r6
	53	#define w2 r7
	54	#define w3 r8
	55	#define w4 r9
	56	#define w5 r10
	57	#define w6 r11
	58	#define w7 r0
	59	#define w8 r2
	60
	61	#define ASSEMBLER
	62	#include <sys/appleapiopts.h>
	63	#include <ppc/asm.h>
	64	#include <machine/cpu_capabilities.h>
65	#include <machine/commpage.h>
66
67	.text
68	.globl EXT(bcopy_g3)
69
70
71	#define kLong 33 // too long for string ops
72
73
74	// Main entry points.
75
76	.align 5
77	bcopy_g3: // void bcopy(const void src, void dst, size_t len)
78	cmplwi rc,kLong // length > 32 bytes?
79	sub w1,r4,r3 // must move in reverse if (rd-rs)<rc
80	mr rd,r4 // start to move source & dest to canonic spot
81	bge LLong0 // skip if long operand
82	mtxer rc // set length for string ops
83	lswx r5,0,r3 // load bytes into r5-r12
84	stswx r5,0,r4 // store them
85	blr
86
87	// NB: memcpy() and memmove() must follow bcopy() by 32 bytes, for comm page.
88
89	.align 5
90	Lmemcpy_g3: // void* memcpy(void dst, void src, size_t len)
91	Lmemmove_g3: // void* memmove(void dst, const void src, size_t len)
92	cmplwi rc,kLong // length > 32 bytes?
93	sub w1,r3,rs // must move in reverse if (rd-rs)<rc
94	mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
95	bge LLong1 // longer than 32 bytes
96	mtxer rc // set length for string ops
97	lswx r5,0,r4 // load bytes into r5-r12
98	stswx r5,0,r3 // store them
99	blr
100
101	// Long operands (more than 32 bytes.)
102	// w1 = (rd-rs), used to check for alignment
103
104	LLong0: // enter from bcopy()
105	mr rs,r3 // must leave r3 alone (it is return value for memcpy)
106	LLong1: // enter from memcpy() and memmove()
107	cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
108	rlwinm r0,w1,0,0x3 // are operands relatively word-aligned?
109	neg w2,rd // prepare to align destination
110	cmpwi cr5,r0,0 // set cr5 beq if relatively word aligned
111	blt cr1,LLongReverse // handle reverse move
112	andi. w4,w2,3 // w4 <- #bytes to word align destination
113	beq cr5,LLongFloat // relatively aligned so use FPRs
114	sub rc,rc,w4 // adjust count for alignment
115	srwi r0,rc,5 // get #chunks to xfer (>=1)
116	rlwinm rc,rc,0,0x1F // mask down to leftover bytes
117	mtctr r0 // set up loop count
118	beq 1f // dest already word aligned
119
120	// Word align the destination.
121
122	mtxer w4 // byte count to xer
123	cmpwi r0,0 // any chunks to xfer?
124	lswx w1,0,rs // move w4 bytes to align dest
125	add rs,rs,w4
126	stswx w1,0,rd
127	add rd,rd,w4
128	beq- 2f // pathologic case, no chunks to xfer
129
130	// Forward, unaligned loop.
131
132	1:
133	lwz w1,0(rs)
134	lwz w2,4(rs)
135	lwz w3,8(rs)
136	lwz w4,12(rs)
137	lwz w5,16(rs)
138	lwz w6,20(rs)
139	lwz w7,24(rs)
140	lwz w8,28(rs)
141	addi rs,rs,32
142	stw w1,0(rd)
143	stw w2,4(rd)
144	stw w3,8(rd)
145	stw w4,12(rd)
146	stw w5,16(rd)
147	stw w6,20(rd)
148	stw w7,24(rd)
149	stw w8,28(rd)
150	addi rd,rd,32
151	bdnz 1b
152	2: // rc = remaining bytes (0-31)
153	mtxer rc // set up count for string ops
154	mr r0,rd // move dest ptr out of the way
155	lswx r5,0,rs // load xer bytes into r5-r12 (rs==r4)
156	stswx r5,0,r0 // store them
157	blr
158
159
160
161	// Forward, aligned loop. We use FPRs.
162
163	LLongFloat:
164	andi. w4,w2,7 // W4 <- #bytes to doubleword-align destination
165	sub rc,rc,w4 // adjust count for alignment
166	srwi r0,rc,5 // number of 32-byte chunks to xfer
167	rlwinm rc,rc,0,0x1F // mask down to leftover bytes
168	mtctr r0 // set up loop count
169	beq 1f // dest already doubleword aligned
170
171	// Doubleword align the destination.
172
173	mtxer w4 // byte count to xer
174	cmpwi r0,0 // any chunks to xfer?
175	lswx w1,0,rs // move w4 bytes to align dest
176	add rs,rs,w4
177	stswx w1,0,rd
178	add rd,rd,w4
179	beq- 2f // pathologic case, no chunks to xfer
180	1: // loop over 32-byte chunks
181	lfd f0,0(rs)
182	lfd f1,8(rs)
183	lfd f2,16(rs)
184	lfd f3,24(rs)
185	addi rs,rs,32
186	stfd f0,0(rd)
187	stfd f1,8(rd)
188	stfd f2,16(rd)
189	stfd f3,24(rd)
190	addi rd,rd,32
191	bdnz 1b
192	2: // rc = remaining bytes (0-31)
193	mtxer rc // set up count for string ops
194	mr r0,rd // move dest ptr out of the way
195	lswx r5,0,rs // load xer bytes into r5-r12 (rs==r4)
196	stswx r5,0,r0 // store them
197	blr
198
199
200	// Long, reverse moves.
201	// cr5 = beq if relatively word aligned
202
203	LLongReverse:
204	add rd,rd,rc // point to end of operands + 1
205	add rs,rs,rc
206	beq cr5,LReverseFloat // aligned operands so can use FPRs
207	srwi r0,rc,5 // get chunk count
208	rlwinm rc,rc,0,0x1F // mask down to leftover bytes
209	mtctr r0 // set up loop count
210	mtxer rc // set up for trailing bytes
211	1:
212	lwz w1,-4(rs)
213	lwz w2,-8(rs)
214	lwz w3,-12(rs)
215	lwz w4,-16(rs)
216	stw w1,-4(rd)
217	lwz w5,-20(rs)
218	stw w2,-8(rd)
219	lwz w6,-24(rs)
220	stw w3,-12(rd)
221	lwz w7,-28(rs)
222	stw w4,-16(rd)
223	lwzu w8,-32(rs)
224	stw w5,-20(rd)
225	stw w6,-24(rd)
226	stw w7,-28(rd)
227	stwu w8,-32(rd)
228	bdnz 1b
229
230	sub r4,rs,rc // point to 1st (leftmost) leftover byte (0..31)
231	sub r0,rd,rc // move dest ptr out of way
232	lswx r5,0,r4 // load xer bytes into r5-r12
233	stswx r5,0,r0 // store them
234	blr
235
236
237	// Long, reverse aligned moves. We use FPRs.
238
239	LReverseFloat:
240	andi. w4,rd,7 // W3 <- #bytes to doubleword-align destination
241	sub rc,rc,w4 // adjust count for alignment
242	srwi r0,rc,5 // number of 32-byte chunks to xfer
243	rlwinm rc,rc,0,0x1F // mask down to leftover bytes
244	mtctr r0 // set up loop count
245	beq 1f // dest already doubleword aligned
246
247	// Doubleword align the destination.
248
249	mtxer w4 // byte count to xer
250	cmpwi r0,0 // any chunks to xfer?
251	sub rs,rs,w4 // point to 1st bytes to xfer
252	sub rd,rd,w4
253	lswx w1,0,rs // move w3 bytes to align dest
254	stswx w1,0,rd
255	beq- 2f // pathologic case, no chunks to xfer
256	1:
257	lfd f0,-8(rs)
258	lfd f1,-16(rs)
259	lfd f2,-24(rs)
260	lfdu f3,-32(rs)
261	stfd f0,-8(rd)
262	stfd f1,-16(rd)
263	stfd f2,-24(rd)
264	stfdu f3,-32(rd)
265	bdnz 1b
266	2: // rc = remaining bytes (0-31)
267	mtxer rc // set up count for string ops
268	sub r4,rs,rc // point to 1st (leftmost) leftover byte (0..31)
269	sub r0,rd,rc // move dest ptr out of way
270	lswx r5,0,r4 // load xer bytes into r5-r12
271	stswx r5,0,r0 // store them
272	blr
273
274	COMMPAGE_DESCRIPTOR(bcopy_g3,_COMM_PAGE_BCOPY,0,k64Bit+kHasAltivec,0)