/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
 * This version might be used when bringing up new processors with known
 * Altivec bugs that need to be worked around. It is not particularly well
 * optimized.
 *
 * For 64-bit processors with a 128-byte cache line, running in either
 * 32- or 64-bit mode. This is written for 32-bit execution; the kernel
 * will translate it to 64-bit code when it compiles the 64-bit commpage.
 *
 * Register usage. Note we use r2, so this code will not run in a PEF/CFM
 * environment.
 *   r0  = "w7" or temp
 *   r2  = "w8"
 *   r3  = not used, as memcpy and memmove return the 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1"
 *   r7  = "w2"
 *   r8  = "w3"
 *   r9  = "w4"
 *   r10 = "w5"
 *   r11 = "w6"
 *   r12 = destination ptr ("rd")
 */
#define rs  r4
#define rd  r12
#define rc  r5
#define rv  r2

#define w1  r6
#define w2  r7
#define w3  r8
#define w4  r9
#define w5  r10
#define w6  r11
#define w7  r0
#define w8  r2

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define kLong   64              // too long for inline loopless code

// Main entry points.

        .align  5
bcopy_64:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kLong                // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r4                   // start to move registers to canonical spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,r3                    // touch in destination
        b       LLong                   // join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_g4:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kLong                // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                    // touch in the first line of source
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong                   // handle medium or long operands

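// In rough C terms (an illustration only, not part of this file), both
// entries above reduce to the sketch below. The unsigned compare on (rd-rs)
// works because (size_t)(dst - src) < len is true exactly when dst lies in
// [src, src+len), i.e. when a forward copy would clobber unread source
// bytes; short_copy/long_copy are hypothetical stand-ins for LShort/LLong.
//
//      void *memmove_sketch(void *dst, const void *src, size_t len) {
//          int reverse = (size_t)((char *)dst - (const char *)src) < len;
//          if (len < kLong)                    // kLong == 64: loopless path
//              short_copy(dst, src, len, reverse);
//          else
//              long_copy(dst, src, len, reverse);
//          return dst;                         // memcpy/memmove return dst
//      }
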
// Handle short operands.

LShort:
        mtcrf   0x02,rc                 // put length bits 26-27 in cr6 (faster one cr at a time)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        blt     cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

LShort64:                               // enter to xfer last (<64) bytes
        bf      26,0f                   // 32-byte chunk to xfer?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w2,8(rd)
        addi    rd,rd,16
1:
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // done if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr

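// A C rendering of the ladder above (illustration only): each bit of the
// count, tested largest first, triggers exactly one move, so any count
// below 64 finishes with no loop. Constant-size memcpy stands in for the
// ld/std pairs; note the real code loads a whole chunk before storing it,
// so overlapping chunks are also safe there.
//
//      #include <string.h>
//      static void short_forward(char *d, const char *s, size_t n) {  // n < 64
//          if (n & 32) { memcpy(d, s, 32); d += 32; s += 32; }
//          if (n & 16) { memcpy(d, s, 16); d += 16; s += 16; }
//          if (n &  8) { memcpy(d, s,  8); d +=  8; s +=  8; }
//          if (n &  4) { memcpy(d, s,  4); d +=  4; s +=  4; }
//          if (n &  2) { memcpy(d, s,  2); d +=  2; s +=  2; }
//          if (n &  1) { *d = *s; }
//      }
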

// Handle short reverse operands.
//      cr6 = bits 26-27 of length
//      cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
LShortReverse64:                        // enter to xfer last (<64) bytes
        bf      26,0f                   // 32-byte chunk to xfer?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr

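// The reverse ladder is the mirror image (illustration only): the pointers
// are first advanced one past the ends of the operands, and the update
// forms (ldu/stdu etc.) then act like C pre-decrement:
//
//      static void short_reverse(char *d, const char *s, size_t n) {  // n < 64
//          d += n; s += n;                     // cf. "add rs,rs,rc" above
//          if (n & 32) { s -= 32; d -= 32; memcpy(d, s, 32); }
//          if (n & 16) { s -= 16; d -= 16; memcpy(d, s, 16); }
//          if (n &  8) { s -=  8; d -=  8; memcpy(d, s,  8); }
//          if (n &  4) { s -=  4; d -=  4; memcpy(d, s,  4); }
//          if (n &  2) { s -=  2; d -=  2; memcpy(d, s,  2); }
//          if (n &  1) { *--d = *--s; }
//      }
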
// Long operands.
//      cr1 = blt iff we must move reverse

        .align  4
LLong:
        dcbtst  0,rd                    // touch in destination
        neg     w3,rd                   // start to compute #bytes to align destination
        andi.   w6,w3,7                 // w6 <- #bytes to 8-byte align destination
        blt     cr1,LLongReverse        // handle reverse moves
        mtctr   w6                      // set up for loop to align destination
        sub     rc,rc,w6                // adjust count
        beq     LAligned                // destination already 8-byte aligned
1:
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
        bdnz    1b

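// The alignment count above, in C (illustration only): negating the
// destination address and masking with 7 yields the distance to the next
// 8-byte boundary (0 if already aligned). An address ending in ...5 gives
// 3; an aligned one gives 0.
//
//      #include <stdint.h>
//      static unsigned bytes_to_align8(const void *p) {
//          return (unsigned)(-(uintptr_t)p & 7);   // cf. "neg w3,rd; andi. w6,w3,7"
//      }
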
// Destination is 8-byte aligned.

LAligned:
        srwi.   w2,rc,6                 // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc                 // leftover byte count to cr (faster one cr at a time)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        beq     LShort64                // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop moving 64-byte chunks.

        .align  5
1:
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        ld      w5,32(rs)
        ld      w6,40(rs)
        ld      w7,48(rs)
        ld      w8,56(rs)
        addi    rs,rs,64
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        std     w5,32(rd)
        std     w6,40(rd)
        std     w7,48(rd)
        std     w8,56(rd)
        addi    rd,rd,64
        bdnz    1b

        b       LShort64

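// In C terms (illustration only), the aligned forward path is:
//
//      static void long_forward(char *d, const char *s, size_t n) {
//          for (size_t chunks = n >> 6; chunks != 0; chunks--) {
//              memcpy(d, s, 64);               // the eight ld/std pairs above
//              d += 64; s += 64;
//          }
//          short_forward(d, s, n & 63);        // remainder; cf. LShort64
//      }
//
// The CTR register plays the role of `chunks`, and the low six bits of the
// count, already sitting in cr6/cr7, describe the remainder for LShort64.
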
// Handle reverse moves.

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,7                 // is destination 8-byte aligned?
        sub     rc,rc,r0                // adjust count
        mtctr   r0                      // set up for byte loop
        beq     LRevAligned             // already aligned

1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

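// Here the alignment count is simpler than in the forward case: the end
// address itself, masked with 7, is the number of bytes to peel off
// ((uintptr_t)(d + n) & 7 in C terms), since we copy downward from the end.
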
// Destination is 8-byte aligned.

LRevAligned:
        srwi.   w2,rc,6                 // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc                 // leftover byte count to cr (faster one cr at a time)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        beq     LShortReverse64         // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop over 64-byte chunks (reverse).

        .align  5
1:
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ld      w4,-32(rs)
        ld      w5,-40(rs)
        ld      w6,-48(rs)
        ld      w7,-56(rs)
        ldu     w8,-64(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        std     w4,-32(rd)
        std     w5,-40(rd)
        std     w6,-48(rd)
        std     w7,-56(rd)
        stdu    w8,-64(rd)
        bdnz    1b

        b       LShortReverse64

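// The descriptor below registers this routine with the commpage machinery
// (see machine/commpage.h): install at _COMM_PAGE_BCOPY on processors whose
// capability word has k64Bit set and kHasAltivec clear, for both the 32-
// and 64-bit commpages (kCommPageBoth), translating the 32-bit code for
// the 64-bit page (kPort32to64).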
        COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)