[apple/xnu.git] / osfmk / ppc / commpage / bcopy_64.s

/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 * 
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
 * This version might be used bringing up new processors, with known
 * Altivec bugs that need to be worked around.  It is not particularly well
 * optimized.
 *
 * For 64-bit processors with a 128-byte cache line, running in either 
 * 32- or 64-bit mode.  This is written for 32-bit execution, the kernel
 * will translate to 64-bit code when it compiles the 64-bit commpage.
 *
 * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
 * environment.
 *   r0  = "w7" or temp
 *   r2  = "w8"
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1"
 *   r7  = "w2"
 *   r8  = "w3"
 *   r9  = "w4"
 *   r10 = "w5"
 *   r11 = "w6"
 *   r12 = destination ptr ("rd")
 */
// Symbolic register names used by every routine in this file; they mirror
// the register-usage table in the header comment above.

// Pointers and byte count.
#define rs	r4
#define rd	r12
#define rc	r5
// NOTE(review): rv names r2, the same register as w8 below, and is not
// referenced anywhere in the visible code -- confirm whether it is still needed.
#define	rv	r2

// Data temporaries w1..w8.  w7 is r0, which the reverse long path also uses
// directly as a scratch register; w8 is r2.
#define w1	r6
#define w2	r7
#define w3	r8
#define	w4	r9
#define	w5	r10
#define	w6	r11
#define	w7	r0
#define	w8	r2

#define	ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

// Length threshold in bytes.  Both entry points compare the count against
// kLong: anything shorter goes to the straight-line LShort code, everything
// else goes to LLong.
#define	kLong		64				// too long for inline loopless code


// Main entry points.

        .align 	5
// bcopy_64 -- entry point for bcopy.
// On entry: r3 = src, r4 = dst, r5 = length in bytes (bcopy argument order).
// On exit to LShort/LLong: rs (r4) = source, rd (r12) = destination,
// rc (r5) = length, and cr1 = blt iff (dst - src) < length as unsigned values,
// which is the condition the later code uses to pick the reverse-copy paths.
// This entry is exactly 8 instructions; the NB comment that follows requires
// the memcpy/memmove entry to sit 8 words past it, so do not add or remove
// instructions here.
bcopy_64:							// void bcopy(const void *src, void *dst, size_t len)
        cmplwi	rc,kLong			// short or long?  (cr0, consumed by the blt below)
        sub		w1,r4,r3			// w1 = dst - src; must move in reverse if (rd-rs)<rc
        cmplw	cr1,w1,rc			// set cr1 blt iff we must move reverse
        mr		rd,r4				// rd = dst; must come before the next mr, which overwrites r4
        mr		rs,r3				// rs (r4) = src; r3 keeps the source address as well
        blt		LShort				// handle short operands
        dcbt	0,r3				// touch in the first line of source (r3 still holds src)
        b		LLong				// join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
        
        .align	5
// Shared entry point for memcpy and memmove.
// On entry: r3 = dst, r4 = src, r5 = length in bytes.  The source and count
// are already in rs (r4) and rc (r5), so only rd has to be set up.  r3 is
// never written, so it still holds the destination pointer on return.
// Operands shorter than kLong fall through into LShort below.
// NOTE(review): the label names carry a _g4 suffix although this file is the
// 64-bit variant, and nothing in the visible code references them.
Lmemcpy_g4:							// void* memcpy(void *dst, void *src, size_t len)
Lmemmove_g4:						// void* memmove(void *dst, const void *src, size_t len)
        cmplwi	rc,kLong			// short or long?  (cr0, consumed by the bge below)
        sub		w1,r3,r4			// w1 = dst - src; must move in reverse if (rd-rs)<rc
        dcbt	0,r4				// touch in the first line of source
        cmplw	cr1,w1,rc			// set cr1 blt iff we must move reverse
        mr		rd,r3				// must leave r3 alone, it is return value for memcpy etc
        bge		LLong				// handle medium or long operands

// Handle short operands.
        
// LShort -- copy fewer than kLong bytes without a loop.
//   In:  rs = source, rd = destination, rc = byte count,
//        cr1 = blt iff the copy has to run from the high end downwards.
// The low bits of the count are dropped into cr6/cr7 so that each
// power-of-two piece of the length can be tested with one CR bit.

LShort:
        mtcrf   0x02,rc                 // cr6 <- count; CR bits 26-27 are the 32 and 16 flags
        mtcrf   0x01,rc                 // cr7 <- count; CR bits 28-31 are the 8, 4, 2 and 1 flags
        blt     cr1,LShortReverse       // destination overlaps above source: go backwards

// Forward tail.  Straight-line and therefore the common fast path.  The long
// forward path re-enters here, with cr6/cr7 reloaded from its residual count,
// to move the final 0..63 bytes.  Within each piece every load is issued
// before the first store.

LShort64:
        bf      26,LFwdTail16           // bit 26 clear: no 32-byte piece
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32

LFwdTail16:
        bf      27,LFwdTail8            // bit 27 clear: no 16-byte piece
        ld      w1,0(rs)
        ld      w2,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w2,8(rd)
        addi    rd,rd,16

LFwdTail8:
        bf      28,LFwdTail4            // bit 28 clear: no doubleword
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8

LFwdTail4:
        bf      29,LFwdTail2            // bit 29 clear: no word
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4

LFwdTail2:
        bf      30,LFwdTail1            // bit 30 clear: no halfword
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2

LFwdTail1:
        bflr    31                      // bit 31 clear: length was even, all done
        lbz     w1,0(rs)                // final odd byte
        stb     w1,0(rd)
        blr
        
        
// Handle short reverse operands.
//		cr6 = bits 26-27 of length
//		cr7 = bits 28-31 of length      

// LShortReverse -- backward copy of fewer than kLong bytes.
//   In:  rs/rd = start of source/destination, rc = byte count,
//        cr6/cr7 = low bits of the count (set up by LShort).
// Both pointers are first moved to one past the end of their operand; every
// piece is then addressed with negative offsets, and the last access of each
// piece is an update form that steps the pointer down past it.

LShortReverse:
        add     rs,rs,rc                // rs = source end
        add     rd,rd,rc                // rd = destination end

// The long reverse path re-enters here, pointers already at the low edge of
// what it has copied and cr6/cr7 reloaded, to move the final 0..63 bytes.

LShortReverse64:
        bf      26,LRevTail16           // bit 26 clear: no 32-byte piece
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)              // load and step rs down by 32
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)              // store and step rd down by 32

LRevTail16:
        bf      27,LRevTail8            // bit 27 clear: no 16-byte piece
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)

LRevTail8:
        bf      28,LRevTail4            // bit 28 clear: no doubleword
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)

LRevTail4:
        bf      29,LRevTail2            // bit 29 clear: no word
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)

LRevTail2:
        bf      30,LRevTail1            // bit 30 clear: no halfword
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)

LRevTail1:
        bflr    31                      // bit 31 clear: length was even, all done
        lbz     w1,-1(rs)               // final byte; pointers are not needed afterwards
        stb     w1,-1(rd)
        blr
        

// Long operands.
//     cr1 = blt iff we must move reverse

        .align	4
// LLong -- operands of kLong bytes or more.
//   In:  rs = source, rd = destination, rc = byte count,
//        cr1 = blt iff the copy has to run backwards.
// Forward plan: byte-copy until the destination is 8-byte aligned, move
// 64-byte chunks in a counted loop, then hand the remainder to LShort64.

LLong:
        dcbtst  0,rd                    // touch the first destination line for store
        neg     w3,rd                   // w3 = -rd
        andi.   w6,w3,7                 // w6 = bytes needed to 8-byte align rd; cr0 eq iff none
        blt     cr1,LLongReverse        // backward copy: set up from the far end instead
        mtctr   w6                      // ctr = number of alignment bytes
        sub     rc,rc,w6                // take them off the remaining count
        beq     LAligned                // cr0 is still from andi.: nothing to align

LFwdAlignLoop:
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
        bdnz    LFwdAlignLoop

// rd is now 8-byte aligned.

LAligned:
        srwi.   w2,rc,6                 // w2 = number of whole 64-byte chunks; cr0 eq iff zero
        mtcrf   0x02,rc                 // residual count bits into cr6 (one field per mtcrf)
        mtcrf   0x01,rc                 // ... and into cr7, ready for the tail code
        beq     LShort64                // fewer than 64 bytes left: tail only
        mtctr   w2
        b       LFwdChunkLoop           // jump over the alignment padding

// One 64-byte chunk per iteration: eight doubleword loads, then eight stores.

        .align  5
LFwdChunkLoop:
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        ld      w5,32(rs)
        ld      w6,40(rs)
        ld      w7,48(rs)
        ld      w8,56(rs)
        addi    rs,rs,64
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        std     w5,32(rd)
        std     w6,40(rd)
        std     w7,48(rd)
        std     w8,56(rd)
        addi    rd,rd,64
        bdnz    LFwdChunkLoop

        b       LShort64                // finish the last 0..63 bytes

        
// Handle reverse moves.

// LLongReverse -- backward copy of kLong bytes or more.
//   In:  rs/rd = start of source/destination, rc = byte count.
// Mirror image of the forward long path: point both registers one past the
// end, byte-copy downwards until the destination end is 8-byte aligned, move
// 64-byte chunks, then let LShortReverse64 handle what is left.

LLongReverse:
        add     rd,rd,rc                // rd = destination end
        add     rs,rs,rc                // rs = source end
        andi.   r0,rd,7                 // r0 = bytes above the last 8-byte boundary; cr0 eq iff none
        sub     rc,rc,r0                // take them off the remaining count
        mtctr   r0                      // ctr = number of alignment bytes
        beq     LRevAligned             // destination end already aligned

LRevAlignLoop:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    LRevAlignLoop

// rd is now 8-byte aligned.

LRevAligned:
        srwi.   w2,rc,6                 // w2 = number of whole 64-byte chunks; cr0 eq iff zero
        mtcrf   0x02,rc                 // residual count bits into cr6 (one field per mtcrf)
        mtcrf   0x01,rc                 // ... and into cr7, ready for the tail code
        beq     LShortReverse64         // fewer than 64 bytes left: tail only
        mtctr   w2
        b       LRevChunkLoop           // jump over the alignment padding

// One 64-byte chunk per iteration, highest doubleword first.  The final load
// and the final store are update forms that step each pointer down by 64.

        .align  5
LRevChunkLoop:
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ld      w4,-32(rs)
        ld      w5,-40(rs)
        ld      w6,-48(rs)
        ld      w7,-56(rs)
        ldu     w8,-64(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        std     w4,-32(rd)
        std     w5,-40(rd)
        std     w6,-48(rd)
        std     w7,-56(rd)
        stdu    w8,-64(rd)
        bdnz    LRevChunkLoop

        b       LShortReverse64         // finish the last 0..63 bytes

	// Descriptor for the routine that starts at bcopy_64, tied to _COMM_PAGE_BCOPY.
	// NOTE(review): COMMPAGE_DESCRIPTOR comes from an included header that is not
	// visible here.  Presumably k64Bit is the required-capability mask and
	// kHasAltivec the excluded-capability mask, which would match the header
	// comment (a 64-bit processor without Altivec), and the last argument
	// selects both commpages with 32-to-64 porting -- confirm against
	// machine/commpage.h.
	COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)
Commit	Line	Data
55e303ae A	1	/*
	2	* Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
	3	*
6601e61a	4	* @APPLE_LICENSE_HEADER_START@
55e303ae	5	*
6601e61a A	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
8f6c56a5	11	*
6601e61a A	12	* This Original Code and all software distributed under the License are
6601e61a A	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5 A	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
8f6c56a5 A	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
6601e61a A	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
8f6c56a5	19	*
6601e61a	20	* @APPLE_LICENSE_HEADER_END@
55e303ae A	21	*/
	22	/* =======================================
	23	* BCOPY, MEMCPY, and MEMMOVE for Mac OS X
	24	* =======================================
	25	*
	26	* Version of 2/20/2003, for a hypothetic 64-bit processor without Altivec.
	27	* This version might be used bringing up new processors, with known
	28	* Altivec bugs that need to be worked around. It is not particularly well
	29	* optimized.
	30	*
91447636 A	31	* For 64-bit processors with a 128-byte cache line, running in either
	32	* 32- or 64-bit mode. This is written for 32-bit execution, the kernel
	33	* will translate to 64-bit code when it compiles the 64-bit commpage.
	34	*
55e303ae A	35	* Register usage. Note we use R2, so this code will not run in a PEF/CFM
	36	* environment.
	37	* r0 = "w7" or temp
	38	* r2 = "w8"
	39	* r3 = not used, as memcpy and memmove return 1st parameter as a value
	40	* r4 = source ptr ("rs")
	41	* r5 = count of bytes to move ("rc")
	42	* r6 = "w1"
	43	* r7 = "w2"
	44	* r8 = "w3"
	45	* r9 = "w4"
	46	* r10 = "w5"
	47	* r11 = "w6"
	48	* r12 = destination ptr ("rd")
	49	*/
	50	#define rs r4
	51	#define rd r12
	52	#define rc r5
	53	#define rv r2
	54
	55	#define w1 r6
	56	#define w2 r7
	57	#define w3 r8
	58	#define w4 r9
	59	#define w5 r10
	60	#define w6 r11
	61	#define w7 r0
	62	#define w8 r2
	63
	64	#define ASSEMBLER
	65	#include <sys/appleapiopts.h>
	66	#include <ppc/asm.h>
	67	#include <machine/cpu_capabilities.h>
	68	#include <machine/commpage.h>
	69
	70	.text
55e303ae A	71
	72	#define kLong 64 // too long for inline loopless code
	73
	74
	75	// Main entry points.
	76
	77	.align 5
	78	bcopy_64: // void bcopy(const void src, void dst, size_t len)
	79	cmplwi rc,kLong // short or long?
	80	sub w1,r4,r3 // must move in reverse if (rd-rs)<rc
	81	cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
	82	mr rd,r4 // start to move registers to canonic spot
	83	mr rs,r3
	84	blt LShort // handle short operands
	85	dcbt 0,r3 // touch in destination
	86	b LLong // join medium/long operand code
	87
	88	// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
	89
	90	.align 5
	91	Lmemcpy_g4: // void* memcpy(void dst, void src, size_t len)
	92	Lmemmove_g4: // void* memmove(void dst, const void src, size_t len)
	93	cmplwi rc,kLong // short or long?
	94	sub w1,r3,r4 // must move in reverse if (rd-rs)<rc
	95	dcbt 0,r4 // touch in the first line of source
	96	cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
	97	mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
	98	bge LLong // handle medium or long operands
	99
	100	// Handle short operands.
	101
	102	LShort:
	103	mtcrf 0x02,rc // put length bits 26-27 in cr6 (faster one cr at a time)
	104	mtcrf 0x01,rc // put length bits 28-31 in cr7
	105	blt cr1,LShortReverse
	106
	107	// Forward short operands. This is the most frequent case, so it is inline.
	108
	109	LShort64: // enter to xfer last 64 bytes
	110	bf 26,0f // 64-byte chunk to xfer?
	111	ld w1,0(rs)
	112	ld w2,8(rs)
	113	ld w3,16(rs)
	114	ld w4,24(rs)
	115	addi rs,rs,32
	116	std w1,0(rd)
	117	std w2,8(rd)
	118	std w3,16(rd)
	119	std w4,24(rd)
	120	addi rd,rd,32
	121	0:
	122	bf 27,1f // quadword to move?
	123	ld w1,0(rs)
	124	ld w2,8(rs)
	125	addi rs,rs,16
	126	std w1,0(rd)
	127	std w2,8(rd)
	128	addi rd,rd,16
	129	1:
	130	bf 28,2f // doubleword?
	131	ld w1,0(rs)
	132	addi rs,rs,8
	133	std w1,0(rd)
	134	addi rd,rd,8
135	2:
136	bf 29,3f // word?
137	lwz w1,0(rs)
138	addi rs,rs,4
139	stw w1,0(rd)
140	addi rd,rd,4
141	3:
142	bf 30,4f // halfword to move?
143	lhz w1,0(rs)
144	addi rs,rs,2
145	sth w1,0(rd)
146	addi rd,rd,2
147	4:
148	bflr 31 // skip if no odd byte
149	lbz w1,0(rs)
150	stb w1,0(rd)
151	blr
152
153
154	// Handle short reverse operands.
155	// cr6 = bits 26-27 of length
156	// cr7 = bits 28-31 of length
157
158	LShortReverse:
159	add rs,rs,rc // adjust ptrs for reverse move
160	add rd,rd,rc
161	LShortReverse64: // enter to xfer last 64 bytes
162	bf 26,0f // 64-byte chunk to xfer?
163	ld w1,-8(rs)
164	ld w2,-16(rs)
165	ld w3,-24(rs)
166	ldu w4,-32(rs)
167	std w1,-8(rd)
168	std w2,-16(rd)
169	std w3,-24(rd)
170	stdu w4,-32(rd)
171	0:
172	bf 27,1f // quadword to move?
173	ld w1,-8(rs)
174	ldu w2,-16(rs)
175	std w1,-8(rd)
176	stdu w2,-16(rd)
177	1:
178	bf 28,2f // doubleword?
179	ldu w1,-8(rs)
180	stdu w1,-8(rd)
181	2:
182	bf 29,3f // word?
183	lwzu w1,-4(rs)
184	stwu w1,-4(rd)
185	3:
186	bf 30,4f // halfword to move?
187	lhzu w1,-2(rs)
188	sthu w1,-2(rd)
189	4:
190	bflr 31 // done if no odd byte
191	lbz w1,-1(rs) // no update
192	stb w1,-1(rd)
193	blr
194
195
196	// Long operands.
197	// cr1 = blt iff we must move reverse
198
199	.align 4
200	LLong:
201	dcbtst 0,rd // touch in destination
202	neg w3,rd // start to compute #bytes to align destination
203	andi. w6,w3,7 // w6 <- #bytes to 8-byte align destination
204	blt cr1,LLongReverse // handle reverse moves
205	mtctr w6 // set up for loop to align destination
206	sub rc,rc,w6 // adjust count
207	beq LAligned // destination already 8-byte aligned
208	1:
209	lbz w1,0(rs)
210	addi rs,rs,1
211	stb w1,0(rd)
212	addi rd,rd,1
213	bdnz 1b
214
215	// Destination is 8-byte aligned.
216
217	LAligned:
218	srwi. w2,rc,6 // w2 <- count of 64-byte chunks
219	mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time)
220	mtcrf 0x01,rc // put length bits 28-31 in cr7
221	beq LShort64 // no 64-byte chunks
222	mtctr w2
223	b 1f
224
225	// Loop moving 64-byte chunks.
226
227	.align 5
228	1:
229	ld w1,0(rs)
230	ld w2,8(rs)
231	ld w3,16(rs)
232	ld w4,24(rs)
233	ld w5,32(rs)
234	ld w6,40(rs)
235	ld w7,48(rs)
236	ld w8,56(rs)
237	addi rs,rs,64
238	std w1,0(rd)
239	std w2,8(rd)
240	std w3,16(rd)
241	std w4,24(rd)
242	std w5,32(rd)
243	std w6,40(rd)
244	std w7,48(rd)
245	std w8,56(rd)
246	addi rd,rd,64
247	bdnz 1b
248
249	b LShort64
250
251
252	// Handle reverse moves.
253
254	LLongReverse:
255	add rd,rd,rc // point to end of operands
256	add rs,rs,rc
257	andi. r0,rd,7 // is destination 8-byte aligned?
258	sub rc,rc,r0 // adjust count
259	mtctr r0 // set up for byte loop
260	beq LRevAligned // already aligned
261
262	1:
263	lbzu w1,-1(rs)
264	stbu w1,-1(rd)
265	bdnz 1b
266
267	// Destination is 8-byte aligned.
268
269	LRevAligned:
270	srwi. w2,rc,6 // w2 <- count of 64-byte chunks
271	mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time)
272	mtcrf 0x01,rc // put length bits 28-31 in cr7
273	beq LShortReverse64 // no 64-byte chunks
274	mtctr w2
275	b 1f
276
277	// Loop over 64-byte chunks (reverse).
278
279	.align 5
280	1:
281	ld w1,-8(rs)
282	ld w2,-16(rs)
283	ld w3,-24(rs)
284	ld w4,-32(rs)
285	ld w5,-40(rs)
286	ld w6,-48(rs)
287	ld w7,-56(rs)
288	ldu w8,-64(rs)
289	std w1,-8(rd)
290	std w2,-16(rd)
291	std w3,-24(rd)
292	std w4,-32(rd)
293	std w5,-40(rd)
294	std w6,-48(rd)
295	std w7,-56(rd)
296	stdu w8,-64(rd)
297	bdnz 1b
298
299	b LShortReverse64
300
91447636	301	COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)