[apple/xnu.git] / osfmk / ppc / commpage / bcopy_64.s

/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 * 
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
 * This version might be used bringing up new processors, with known
 * Altivec bugs that need to be worked around.  It is not particularly well
 * optimized.
 *
 * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
 * environment.
 *   r0  = "w7" or temp
 *   r2  = "w8"
 *   r3  = never modified, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1"
 *   r7  = "w2"
 *   r8  = "w3"
 *   r9  = "w4"
 *   r10 = "w5"
 *   r11 = "w6"
 *   r12 = destination ptr ("rd")
 */
#define rs	r4
#define rd	r12
#define rc	r5
#define	rv	r2

#define w1	r6
#define w2	r7
#define w3	r8
#define	w4	r9
#define	w5	r10
#define	w6	r11
#define	w7	r0
#define	w8	r2

#define	ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl 	EXT(bcopy_64)

#define	kLong		64				// too long for inline loopless code


// Main entry points.

        .align 	5
// bcopy_64 -- void bcopy(const void *src, void *dst, size_t len)
//
// On entry r3 = src, r4 = dst, r5 = len (rc).  The operands are moved into the
// registers used by the rest of the file (rs = r4, rd = r12) and the two
// decisions shared by every path are made here:
//     cr0 = rc compared with kLong          (blt -> short operand)
//     cr1 = (dst - src) compared with rc    (blt -> must copy in reverse)
// Both compares are unsigned 32-bit compares (cmplwi/cmplw).
// This entry is exactly 8 instructions long; the NB comment that follows it
// requires the memmove entry to sit 8 words past bcopy, so do not add or
// remove instructions here.
bcopy_64:							// void bcopy(const void *src, void *dst, size_t len)
        cmplwi	rc,kLong			// short or long? (result in cr0, consumed by blt below)
        sub		w1,r4,r3			// w1 <- dst - src; must move in reverse if (rd-rs)<rc
        cmplw	cr1,w1,rc			// set cr1 blt iff we must move reverse
        mr		rd,r4				// rd <- dst; start to move registers to canonic spot
        mr		rs,r3				// rs (r4) <- src; r3 keeps its original value
        blt		LShort				// handle short operands (rc < kLong)
        dcbt	0,r3				// touch in first line of source (r3 still holds src)
        b		LLong				// join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
        
        .align	5
// memcpy / memmove entry.  Both labels name the same code, so memcpy gets the
// same overlap handling as memmove.
//
// On entry r3 = dst, r4 = src (already in rs), r5 = len (rc).  r3 is only read,
// never written, so it still holds the destination pointer when we return.
// Sets cr0 (short vs long) and cr1 (forward vs reverse), then either branches
// to LLong or falls through into LShort below.
Lmemcpy_g4:							// void* memcpy(void *dst, void *src, size_t len)
Lmemmove_g4:						// void* memmove(void *dst, const void *src, size_t len)
        cmplwi	rc,kLong			// short or long? (result in cr0, consumed by bge below)
        sub		w1,r3,r4			// w1 <- dst - src; must move in reverse if (rd-rs)<rc
        dcbt	0,r4				// touch in the first line of source
        cmplw	cr1,w1,rc			// set cr1 blt iff we must move reverse
        mr		rd,r3				// must leave r3 alone, it is return value for memcpy etc
        bge		LLong				// handle medium or long operands (rc >= kLong)

// Handle short operands.
        
// LShort: copy an operand shorter than kLong bytes with straight-line code.
//     rs  = source ptr, rd = destination ptr, rc = byte count
//     cr1 = blt iff the copy must run in reverse
// The low bits of rc are copied into cr6/cr7 so that each power-of-two piece
// can be tested with a single bf.  CR bit -> bytes moved when the bit is set:
//     26 -> 32, 27 -> 16, 28 -> 8, 29 -> 4, 30 -> 2, 31 -> 1
//
// LShort64 is also entered from the long-operand code (LAligned) to move the
// 0-63 bytes left over after its 64-byte loop; that code loads cr6/cr7 itself.
LShort:
        mtcrf	0x02,rc				// put length bits 26-27 in cr6 (faster one cr at a time)
        mtcrf	0x01,rc				// put length bits 28-31 in cr7
        blt		cr1,LShortReverse	// operands overlap such that we must copy backwards
        
// Forward short operands.  This is the most frequent case, so it is inline.

LShort64:							// enter to xfer remaining 0-63 bytes (length bits in cr6/cr7)
        bf		26,0f				// 32-byte chunk to xfer?
        ld		w1,0(rs)			// all four loads are issued before the first store
        ld		w2,8(rs)
        ld		w3,16(rs)
        ld		w4,24(rs)
        addi	rs,rs,32
        std		w1,0(rd)
        std		w2,8(rd)
        std		w3,16(rd)
        std		w4,24(rd)
        addi	rd,rd,32
0:
        bf		27,1f				// quadword (16 bytes) to move?
        ld		w1,0(rs)
        ld		w2,8(rs)
        addi	rs,rs,16
        std		w1,0(rd)
        std		w2,8(rd)
        addi	rd,rd,16
1:
        bf		28,2f				// doubleword (8 bytes)?
        ld		w1,0(rs)
        addi	rs,rs,8
        std		w1,0(rd)
        addi	rd,rd,8
2:
        bf		29,3f				// word (4 bytes)?
        lwz		w1,0(rs)
        addi	rs,rs,4
        stw		w1,0(rd)
        addi	rd,rd,4
3:
        bf		30,4f				// halfword (2 bytes) to move?
        lhz		w1,0(rs)
        addi	rs,rs,2
        sth		w1,0(rd)
        addi	rd,rd,2
4:
        bflr	31					// return now if no odd byte
        lbz		w1,0(rs)			// last byte; pointers need no further update
        stb		w1,0(rd)
        blr
        
        
// Handle short reverse operands.
//		cr6 = bits 26-27 of length
//		cr7 = bits 28-31 of length      

// LShortReverse: copy a short operand from its last byte toward its first.
//     rs, rd = start of source / destination on entry at LShortReverse
//     rc     = byte count; cr6/cr7 already hold its low bits
// Both pointers are first advanced to one past the end of their operand; each
// piece then uses negative displacements, with the final access of the piece
// in update form (ldu/stdu etc.) so the pointer steps down by the piece size.
// CR bit -> bytes moved when the bit is set:
//     26 -> 32, 27 -> 16, 28 -> 8, 29 -> 4, 30 -> 2, 31 -> 1
//
// LShortReverse64 is also entered from LRevAligned with rs/rd already pointing
// past the remaining 0-63 bytes.
LShortReverse:
        add		rs,rs,rc			// adjust ptrs for reverse move
        add		rd,rd,rc
LShortReverse64:					// enter to xfer remaining 0-63 bytes (length bits in cr6/cr7)
        bf		26,0f				// 32-byte chunk to xfer?
        ld		w1,-8(rs)
        ld		w2,-16(rs)
        ld		w3,-24(rs)
        ldu		w4,-32(rs)			// update form: rs <- rs - 32
        std		w1,-8(rd)
        std		w2,-16(rd)
        std		w3,-24(rd)
        stdu	w4,-32(rd)			// update form: rd <- rd - 32
0:
        bf		27,1f				// quadword (16 bytes) to move?
        ld		w1,-8(rs)
        ldu		w2,-16(rs)
        std		w1,-8(rd)
        stdu	w2,-16(rd)
1:
        bf		28,2f				// doubleword (8 bytes)?
        ldu		w1,-8(rs)
        stdu	w1,-8(rd)
2:
        bf		29,3f				// word (4 bytes)?
        lwzu	w1,-4(rs)
        stwu	w1,-4(rd)
3:
        bf		30,4f				// halfword (2 bytes) to move?
        lhzu	w1,-2(rs)
        sthu	w1,-2(rd)
4:
        bflr	31					// done if no odd byte
        lbz 	w1,-1(rs)			// no update
        stb 	w1,-1(rd)
        blr
        

// Long operands.
//     cr1 = blt iff we must move reverse

        .align	4
// LLong: forward copy of an operand of at least kLong bytes (reverse copies
// are handed off to LLongReverse).
//     rs, rd = source / destination ptrs, rc = byte count
//     cr1    = blt iff we must move reverse
// Steps: (1) copy single bytes until rd is 8-byte aligned, (2) copy 64-byte
// chunks with eight doubleword loads followed by eight doubleword stores,
// (3) branch to LShort64 for the remaining 0-63 bytes.
// Only the destination is aligned; the source doublewords are loaded from
// whatever alignment rs ends up with.
LLong:
        dcbtst	0,rd				// touch in destination
        neg		w3,rd				// start to compute #bytes to align destination
        andi.	w6,w3,7				// w6 <- #bytes to 8-byte align destination (cr0 eq iff 0)
        blt		cr1,LLongReverse	// handle reverse moves
        mtctr	w6					// set up for loop to align destination
        sub		rc,rc,w6			// adjust count
        beq		LAligned			// destination already 8-byte aligned (cr0 still from andi.)
1:
        lbz		w1,0(rs)			// byte-at-a-time until rd reaches an 8-byte boundary
        addi	rs,rs,1
        stb		w1,0(rd)
        addi	rd,rd,1
        bdnz	1b
        
// Destination is 8-byte aligned.

LAligned:
        srwi.	w2,rc,6				// w2 <- count of 64-byte chunks (cr0 eq iff none)
        mtcrf	0x02,rc				// leftover byte count to cr (faster one cr at a time)
        mtcrf	0x01,rc				// put length bits 28-31 in cr7
        beq		LShort64			// no 64-byte chunks
        mtctr	w2					// ctr <- chunk count; w2 is free for reuse in the loop
        b		1f					// skip any padding inserted by the .align below
        
// Loop moving 64-byte chunks.

        .align	5
1:
        ld		w1,0(rs)
        ld		w2,8(rs)
        ld		w3,16(rs)
        ld		w4,24(rs)
        ld		w5,32(rs)
        ld		w6,40(rs)
        ld		w7,48(rs)
        ld		w8,56(rs)
        addi	rs,rs,64
        std		w1,0(rd)
        std		w2,8(rd)
        std		w3,16(rd)
        std		w4,24(rd)
        std		w5,32(rd)
        std		w6,40(rd)
        std		w7,48(rd)
        std		w8,56(rd)
        addi	rd,rd,64
        bdnz	1b
        
        b		LShort64			// finish the remaining 0-63 bytes (bits already in cr6/cr7)

        
// Handle reverse moves.

// LLongReverse: reverse (high-to-low address) copy of an operand of at least
// kLong bytes.  Reached from LLong when cr1 is blt.
//     rs, rd = start of source / destination on entry, rc = byte count
// Steps: (1) advance both pointers to one past the end of their operand,
// (2) copy single bytes downward until rd is 8-byte aligned, (3) copy 64-byte
// chunks downward, (4) branch to LShortReverse64 for the remaining 0-63 bytes.
// The forward alignment count LLong left in w6 is not used on this path.
LLongReverse:
        add		rd,rd,rc				// point to end of operands
        add		rs,rs,rc
        andi.	r0,rd,7					// is destination 8-byte aligned? (r0 <- #bytes past boundary)
        sub		rc,rc,r0				// adjust count
        mtctr	r0						// set up for byte loop
        beq		LRevAligned				// already aligned (cr0 still from andi.)
        
1:
        lbzu	w1,-1(rs)				// pre-decrement both pointers, one byte per iteration
        stbu	w1,-1(rd)
        bdnz	1b

// Destination is 8-byte aligned.

LRevAligned:
        srwi.	w2,rc,6				// w2 <- count of 64-byte chunks (cr0 eq iff none)
        mtcrf	0x02,rc				// leftover byte count to cr (faster one cr at a time)
        mtcrf	0x01,rc				// put length bits 28-31 in cr7
        beq		LShortReverse64		// no 64-byte chunks
        mtctr	w2					// ctr <- chunk count; w2 is free for reuse in the loop
        b		1f					// skip any padding inserted by the .align below

// Loop over 64-byte chunks (reverse).

        .align	5
1:
        ld		w1,-8(rs)
        ld		w2,-16(rs)
        ld		w3,-24(rs)
        ld		w4,-32(rs)
        ld		w5,-40(rs)
        ld		w6,-48(rs)
        ld		w7,-56(rs)
        ldu		w8,-64(rs)			// update form: rs <- rs - 64
        std		w1,-8(rd)
        std		w2,-16(rd)
        std		w3,-24(rd)
        std		w4,-32(rd)
        std		w5,-40(rd)
        std		w6,-48(rd)
        std		w7,-56(rd)
        stdu	w8,-64(rd)			// update form: rd <- rd - 64
        bdnz	1b
        
        b		LShortReverse64		// finish the remaining 0-63 bytes (bits already in cr6/cr7)

        // Comm page descriptor for bcopy_64 at _COMM_PAGE_BCOPY.
        // NOTE(review): k64Bit and kHasAltivec presumably mean "requires a 64-bit
        // CPU" and "must not have Altivec", matching the file header -- confirm
        // the argument order against machine/commpage.h.
        COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,0)
Commit	Line	Data
55e303ae A	1	/*
	2	* Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
e5568f75 A	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
55e303ae	11	*
e5568f75 A	12	* This Original Code and all software distributed under the License are
e5568f75 A	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
55e303ae A	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
55e303ae A	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
e5568f75 A	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
55e303ae A	19	*
	20	* @APPLE_LICENSE_HEADER_END@
	21	*/
	22	/* =======================================
	23	* BCOPY, MEMCPY, and MEMMOVE for Mac OS X
	24	* =======================================
	25	*
	26	* Version of 2/20/2003, for a hypothetic 64-bit processor without Altivec.
	27	* This version might be used bringing up new processors, with known
	28	* Altivec bugs that need to be worked around. It is not particularly well
	29	* optimized.
	30	*
	31	* Register usage. Note we use R2, so this code will not run in a PEF/CFM
	32	* environment.
	33	* r0 = "w7" or temp
	34	* r2 = "w8"
	35	* r3 = not used, as memcpy and memmove return 1st parameter as a value
	36	* r4 = source ptr ("rs")
	37	* r5 = count of bytes to move ("rc")
	38	* r6 = "w1"
	39	* r7 = "w2"
	40	* r8 = "w3"
	41	* r9 = "w4"
	42	* r10 = "w5"
	43	* r11 = "w6"
	44	* r12 = destination ptr ("rd")
	45	*/
	46	#define rs r4
	47	#define rd r12
	48	#define rc r5
	49	#define rv r2
	50
	51	#define w1 r6
	52	#define w2 r7
	53	#define w3 r8
	54	#define w4 r9
	55	#define w5 r10
	56	#define w6 r11
	57	#define w7 r0
	58	#define w8 r2
	59
	60	#define ASSEMBLER
	61	#include <sys/appleapiopts.h>
	62	#include <ppc/asm.h>
	63	#include <machine/cpu_capabilities.h>
	64	#include <machine/commpage.h>
	65
	66	.text
	67	.globl EXT(bcopy_64)
	68
	69	#define kLong 64 // too long for inline loopless code
	70
	71
	72	// Main entry points.
	73
	74	.align 5
	75	bcopy_64: // void bcopy(const void src, void dst, size_t len)
	76	cmplwi rc,kLong // short or long?
	77	sub w1,r4,r3 // must move in reverse if (rd-rs)<rc
	78	cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
	79	mr rd,r4 // start to move registers to canonic spot
	80	mr rs,r3
	81	blt LShort // handle short operands
	82	dcbt 0,r3 // touch in destination
83	b LLong // join medium/long operand code
84
85	// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
86
87	.align 5
88	Lmemcpy_g4: // void* memcpy(void dst, void src, size_t len)
89	Lmemmove_g4: // void* memmove(void dst, const void src, size_t len)
90	cmplwi rc,kLong // short or long?
91	sub w1,r3,r4 // must move in reverse if (rd-rs)<rc
92	dcbt 0,r4 // touch in the first line of source
93	cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
94	mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
95	bge LLong // handle medium or long operands
96
97	// Handle short operands.
98
99	LShort:
100	mtcrf 0x02,rc // put length bits 26-27 in cr6 (faster one cr at a time)
101	mtcrf 0x01,rc // put length bits 28-31 in cr7
102	blt cr1,LShortReverse
103
104	// Forward short operands. This is the most frequent case, so it is inline.
105
106	LShort64: // enter to xfer last 64 bytes
107	bf 26,0f // 64-byte chunk to xfer?
108	ld w1,0(rs)
109	ld w2,8(rs)
110	ld w3,16(rs)
111	ld w4,24(rs)
112	addi rs,rs,32
113	std w1,0(rd)
114	std w2,8(rd)
115	std w3,16(rd)
116	std w4,24(rd)
117	addi rd,rd,32
118	0:
119	bf 27,1f // quadword to move?
120	ld w1,0(rs)
121	ld w2,8(rs)
122	addi rs,rs,16
123	std w1,0(rd)
124	std w2,8(rd)
125	addi rd,rd,16
126	1:
127	bf 28,2f // doubleword?
128	ld w1,0(rs)
129	addi rs,rs,8
130	std w1,0(rd)
131	addi rd,rd,8
132	2:
133	bf 29,3f // word?
134	lwz w1,0(rs)
135	addi rs,rs,4
136	stw w1,0(rd)
137	addi rd,rd,4
138	3:
139	bf 30,4f // halfword to move?
140	lhz w1,0(rs)
141	addi rs,rs,2
142	sth w1,0(rd)
143	addi rd,rd,2
144	4:
145	bflr 31 // skip if no odd byte
146	lbz w1,0(rs)
147	stb w1,0(rd)
148	blr
149
150
151	// Handle short reverse operands.
152	// cr6 = bits 26-27 of length
153	// cr7 = bits 28-31 of length
154
155	LShortReverse:
156	add rs,rs,rc // adjust ptrs for reverse move
157	add rd,rd,rc
158	LShortReverse64: // enter to xfer last 64 bytes
159	bf 26,0f // 64-byte chunk to xfer?
160	ld w1,-8(rs)
161	ld w2,-16(rs)
162	ld w3,-24(rs)
163	ldu w4,-32(rs)
164	std w1,-8(rd)
165	std w2,-16(rd)
166	std w3,-24(rd)
167	stdu w4,-32(rd)
168	0:
169	bf 27,1f // quadword to move?
170	ld w1,-8(rs)
171	ldu w2,-16(rs)
172	std w1,-8(rd)
173	stdu w2,-16(rd)
174	1:
175	bf 28,2f // doubleword?
176	ldu w1,-8(rs)
177	stdu w1,-8(rd)
178	2:
179	bf 29,3f // word?
180	lwzu w1,-4(rs)
181	stwu w1,-4(rd)
182	3:
183	bf 30,4f // halfword to move?
184	lhzu w1,-2(rs)
185	sthu w1,-2(rd)
186	4:
187	bflr 31 // done if no odd byte
188	lbz w1,-1(rs) // no update
189	stb w1,-1(rd)
190	blr
191
192
193	// Long operands.
194	// cr1 = blt iff we must move reverse
195
196	.align 4
197	LLong:
198	dcbtst 0,rd // touch in destination
199	neg w3,rd // start to compute #bytes to align destination
200	andi. w6,w3,7 // w6 <- #bytes to 8-byte align destination
201	blt cr1,LLongReverse // handle reverse moves
202	mtctr w6 // set up for loop to align destination
203	sub rc,rc,w6 // adjust count
204	beq LAligned // destination already 8-byte aligned
205	1:
206	lbz w1,0(rs)
207	addi rs,rs,1
208	stb w1,0(rd)
209	addi rd,rd,1
210	bdnz 1b
211
212	// Destination is 8-byte aligned.
213
214	LAligned:
215	srwi. w2,rc,6 // w2 <- count of 64-byte chunks
216	mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time)
217	mtcrf 0x01,rc // put length bits 28-31 in cr7
218	beq LShort64 // no 64-byte chunks
219	mtctr w2
220	b 1f
221
222	// Loop moving 64-byte chunks.
223
224	.align 5
225	1:
226	ld w1,0(rs)
227	ld w2,8(rs)
228	ld w3,16(rs)
229	ld w4,24(rs)
230	ld w5,32(rs)
231	ld w6,40(rs)
232	ld w7,48(rs)
233	ld w8,56(rs)
234	addi rs,rs,64
235	std w1,0(rd)
236	std w2,8(rd)
237	std w3,16(rd)
238	std w4,24(rd)
239	std w5,32(rd)
240	std w6,40(rd)
241	std w7,48(rd)
242	std w8,56(rd)
243	addi rd,rd,64
244	bdnz 1b
245
246	b LShort64
247
248
249	// Handle reverse moves.
250
251	LLongReverse:
252	add rd,rd,rc // point to end of operands
253	add rs,rs,rc
254	andi. r0,rd,7 // is destination 8-byte aligned?
255	sub rc,rc,r0 // adjust count
256	mtctr r0 // set up for byte loop
257	beq LRevAligned // already aligned
258
259	1:
260	lbzu w1,-1(rs)
261	stbu w1,-1(rd)
262	bdnz 1b
263
264	// Destination is 8-byte aligned.
265
266	LRevAligned:
267	srwi. w2,rc,6 // w2 <- count of 64-byte chunks
268	mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time)
269	mtcrf 0x01,rc // put length bits 28-31 in cr7
270	beq LShortReverse64 // no 64-byte chunks
271	mtctr w2
272	b 1f
273
274	// Loop over 64-byte chunks (reverse).
275
276	.align 5
277	1:
278	ld w1,-8(rs)
279	ld w2,-16(rs)
280	ld w3,-24(rs)
281	ld w4,-32(rs)
282	ld w5,-40(rs)
283	ld w6,-48(rs)
284	ld w7,-56(rs)
285	ldu w8,-64(rs)
286	std w1,-8(rd)
287	std w2,-16(rd)
288	std w3,-24(rd)
289	std w4,-32(rd)
290	std w5,-40(rd)
291	std w6,-48(rd)
292	std w7,-56(rd)
293	stdu w8,-64(rd)
294	bdnz 1b
295
296	b LShortReverse64
297
298	COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,0)