git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
	11	*
	12	* This Original Code and all software distributed under the License are
	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
	19	*
	20	* @APPLE_LICENSE_HEADER_END@
	21	*/
	22	/* =======================================
	23	* BCOPY, MEMCPY, and MEMMOVE for Mac OS X
	24	* =======================================
	25	*
	26	* Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
	27	* This version might be used bringing up new processors, with known
	28	* Altivec bugs that need to be worked around. It is not particularly well
	29	* optimized.
	30	*
	31	* For 64-bit processors with a 128-byte cache line, running in either
	32	* 32- or 64-bit mode. This is written for 32-bit execution, the kernel
	33	* will translate to 64-bit code when it compiles the 64-bit commpage.
	34	*
	35	* Register usage. Note we use R2, so this code will not run in a PEF/CFM
	36	* environment.
	37	* r0 = "w7" or temp
	38	* r2 = "w8"
	39	* r3 = not used, as memcpy and memmove return 1st parameter as a value
	40	* r4 = source ptr ("rs")
	41	* r5 = count of bytes to move ("rc")
	42	* r6 = "w1"
	43	* r7 = "w2"
	44	* r8 = "w3"
	45	* r9 = "w4"
	46	* r10 = "w5"
	47	* r11 = "w6"
	48	* r12 = destination ptr ("rd")
	49	*/
	50	#define rs r4
	51	#define rd r12
	52	#define rc r5
	53	#define rv r2
	54	// rs/rd/rc above: source ptr, destination ptr, byte count. rv shares r2 with w8 and is not referenced in the visible code. w1-w8 below: work registers (data in flight between loads and stores, plus temporaries).
	55	#define w1 r6
	56	#define w2 r7
	57	#define w3 r8
	58	#define w4 r9
	59	#define w5 r10
	60	#define w6 r11
	61	#define w7 r0
	62	#define w8 r2
	63
	64	#define ASSEMBLER
	65	#include <sys/appleapiopts.h>
	66	#include <ppc/asm.h>
	67	#include <machine/cpu_capabilities.h>
	68	#include <machine/commpage.h>
	69
	70	.text
	71
	72	#define kLong 64 // lengths below this use the inline loopless code (LShort); all others go to LLong
	73
	74
	75	// Main entry points. bcopy_64 receives (src, dst, len) in r3, r4, r5, moves the pointers into rs/rd, and joins the common code.
	76
	77	.align 5
	78	bcopy_64: // void bcopy(const void *src, void *dst, size_t len)
	79	cmplwi rc,kLong // short or long? (result in cr0, tested by the blt below)
	80	sub w1,r4,r3 // w1 = dst - src; must move in reverse if (rd-rs)<rc
	81	cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
	82	mr rd,r4 // start to move registers to canonic spot (rd <- dst)
	83	mr rs,r3 // rs <- src; r3 itself is left unchanged
	84	blt LShort // handle short operands (len < kLong)
	85	dcbt 0,r3 // touch in first line of source (r3 still holds src here)
	86	b LLong // join medium/long operand code
	87
	88	// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses. (bcopy_64 above is 8 instructions; the .align 5 below presumably places this entry on the following 32-byte boundary.)
	89	// NOTE(review): the two labels below carry a _g4 suffix although this is the 64-bit version; neither is referenced in the visible code.
	90	.align 5
	91	Lmemcpy_g4: // void* memcpy(void *dst, const void *src, size_t len)
	92	Lmemmove_g4: // void* memmove(void *dst, const void *src, size_t len)
	93	cmplwi rc,kLong // short or long? (result in cr0, tested by the bge below)
	94	sub w1,r3,r4 // w1 = dst - src; must move in reverse if (rd-rs)<rc
	95	dcbt 0,r4 // touch in the first line of source
	96	cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
	97	mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
	98	bge LLong // handle medium or long operands; otherwise fall through into LShort
	99
	100	// Handle short operands (fewer than kLong bytes). Entered with rs/rd/rc set and cr1 = blt iff we must move reverse.
	101
	102	LShort:
	103	mtcrf 0x02,rc // put length bits 26-27 in cr6 (faster one cr at a time)
	104	mtcrf 0x01,rc // put length bits 28-31 in cr7
	105	blt cr1,LShortReverse // overlapping operands: must move from the end backward
	106
	107	// Forward short operands. This is the most frequent case, so it is inline.
	108
	109	LShort64: // enter to xfer last 0..63 bytes; CR bits 26-31 hold the low 6 bits of the length
	110	bf 26,0f // 32-byte chunk to xfer?
	111	ld w1,0(rs) // all four doublewords are loaded before any is stored
	112	ld w2,8(rs)
	113	ld w3,16(rs)
	114	ld w4,24(rs)
	115	addi rs,rs,32
	116	std w1,0(rd)
	117	std w2,8(rd)
	118	std w3,16(rd)
	119	std w4,24(rd)
	120	addi rd,rd,32
	121	0:
	122	bf 27,1f // quadword (16 bytes) to move?
	123	ld w1,0(rs)
	124	ld w2,8(rs)
	125	addi rs,rs,16
	126	std w1,0(rd)
	127	std w2,8(rd)
	128	addi rd,rd,16
	129	1:
	130	bf 28,2f // doubleword (8 bytes)?
	131	ld w1,0(rs)
	132	addi rs,rs,8
	133	std w1,0(rd)
	134	addi rd,rd,8
	135	2:
	136	bf 29,3f // word (4 bytes)?
	137	lwz w1,0(rs)
	138	addi rs,rs,4
	139	stw w1,0(rd)
	140	addi rd,rd,4
	141	3:
	142	bf 30,4f // halfword (2 bytes) to move?
	143	lhz w1,0(rs)
	144	addi rs,rs,2
	145	sth w1,0(rd)
	146	addi rd,rd,2
	147	4:
	148	bflr 31 // return now if no odd byte
	149	lbz w1,0(rs) // move the final byte; pointers need no further update
	150	stb w1,0(rd)
	151	blr
	152
	153
	154	// Handle short reverse operands: copy from the end of the operands toward the start.
	155	// cr6 = bits 26-27 of length
	156	// cr7 = bits 28-31 of length
	157
	158	LShortReverse:
	159	add rs,rs,rc // adjust ptrs for reverse move (point just past the last byte)
	160	add rd,rd,rc
	161	LShortReverse64: // enter to xfer last 0..63 bytes; rs/rd point just past the bytes still to move
	162	bf 26,0f // 32-byte chunk to xfer?
	163	ld w1,-8(rs) // all four doublewords are loaded before any is stored
	164	ld w2,-16(rs)
	165	ld w3,-24(rs)
	166	ldu w4,-32(rs) // update form: also steps rs back by 32
	167	std w1,-8(rd)
	168	std w2,-16(rd)
	169	std w3,-24(rd)
	170	stdu w4,-32(rd) // update form: also steps rd back by 32
	171	0:
	172	bf 27,1f // quadword (16 bytes) to move?
	173	ld w1,-8(rs)
	174	ldu w2,-16(rs) // steps rs back by 16
	175	std w1,-8(rd)
	176	stdu w2,-16(rd) // steps rd back by 16
	177	1:
	178	bf 28,2f // doubleword (8 bytes)?
	179	ldu w1,-8(rs)
	180	stdu w1,-8(rd)
	181	2:
	182	bf 29,3f // word (4 bytes)?
	183	lwzu w1,-4(rs)
	184	stwu w1,-4(rd)
	185	3:
	186	bf 30,4f // halfword (2 bytes) to move?
	187	lhzu w1,-2(rs)
	188	sthu w1,-2(rd)
	189	4:
	190	bflr 31 // done if no odd byte
	191	lbz w1,-1(rs) // no update
	192	stb w1,-1(rd)
	193	blr
	194
	195
	196	// Long operands (kLong bytes or more). Forward path: byte-align the destination, move 64-byte chunks, then finish in LShort64.
	197	// cr1 = blt iff we must move reverse
	198
	199	.align 4
	200	LLong:
	201	dcbtst 0,rd // touch in destination
	202	neg w3,rd // start to compute #bytes to align destination
	203	andi. w6,w3,7 // w6 <- #bytes to 8-byte align destination (also sets cr0 for the beq below)
	204	blt cr1,LLongReverse // handle reverse moves (rc is still the full length here)
	205	mtctr w6 // set up for loop to align destination
	206	sub rc,rc,w6 // adjust count
	207	beq LAligned // destination already 8-byte aligned
	208	1: // byte loop: move w6 (1..7) bytes so that rd becomes 8-byte aligned
	209	lbz w1,0(rs)
	210	addi rs,rs,1
	211	stb w1,0(rd)
	212	addi rd,rd,1
	213	bdnz 1b
	214
	215	// Destination is 8-byte aligned. rc = bytes still to move.
	216
	217	LAligned:
	218	srwi. w2,rc,6 // w2 <- count of 64-byte chunks
	219	mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time)
	220	mtcrf 0x01,rc // put length bits 28-31 in cr7
	221	beq LShort64 // no 64-byte chunks
	222	mtctr w2
	223	b 1f // enter the loop (presumably skipping any padding the .align 5 inserts)
	224
	225	// Loop moving 64-byte chunks. All eight doublewords are loaded before any is stored.
	226
	227	.align 5
	228	1:
	229	ld w1,0(rs)
	230	ld w2,8(rs)
	231	ld w3,16(rs)
	232	ld w4,24(rs)
	233	ld w5,32(rs)
	234	ld w6,40(rs)
	235	ld w7,48(rs)
	236	ld w8,56(rs)
	237	addi rs,rs,64
	238	std w1,0(rd)
	239	std w2,8(rd)
	240	std w3,16(rd)
	241	std w4,24(rd)
	242	std w5,32(rd)
	243	std w6,40(rd)
	244	std w7,48(rd)
	245	std w8,56(rd)
	246	addi rd,rd,64
	247	bdnz 1b
	248
	249	b LShort64 // move the leftover 0..63 bytes using the CR bits set at LAligned
	250
	251
	252	// Handle reverse moves of long operands: byte-align the end of the destination, move 64-byte chunks backward, then finish in LShortReverse64.
	253
	254	LLongReverse:
	255	add rd,rd,rc // point to end of operands
	256	add rs,rs,rc
	257	andi. r0,rd,7 // is destination 8-byte aligned? r0 <- #bytes past the last 8-byte boundary
	258	sub rc,rc,r0 // adjust count
	259	mtctr r0 // set up for byte loop
	260	beq LRevAligned // already aligned
	261
	262	1: // byte loop: move r0 (1..7) bytes backward so that rd becomes 8-byte aligned
	263	lbzu w1,-1(rs)
	264	stbu w1,-1(rd)
	265	bdnz 1b
	266
	267	// Destination is 8-byte aligned. rc = bytes still to move.
	268
	269	LRevAligned:
	270	srwi. w2,rc,6 // w2 <- count of 64-byte chunks
	271	mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time)
	272	mtcrf 0x01,rc // put length bits 28-31 in cr7
	273	beq LShortReverse64 // no 64-byte chunks
	274	mtctr w2
	275	b 1f // enter the loop (presumably skipping any padding the .align 5 inserts)
	276
	277	// Loop over 64-byte chunks (reverse). All eight doublewords are loaded before any is stored.
	278
	279	.align 5
	280	1:
	281	ld w1,-8(rs)
	282	ld w2,-16(rs)
	283	ld w3,-24(rs)
	284	ld w4,-32(rs)
	285	ld w5,-40(rs)
	286	ld w6,-48(rs)
	287	ld w7,-56(rs)
	288	ldu w8,-64(rs) // update form: also steps rs back by 64
	289	std w1,-8(rd)
	290	std w2,-16(rd)
	291	std w3,-24(rd)
	292	std w4,-32(rd)
	293	std w5,-40(rd)
	294	std w6,-48(rd)
	295	std w7,-56(rd)
	296	stdu w8,-64(rd) // update form: also steps rd back by 64
	297	bdnz 1b
	298
	299	b LShortReverse64 // move the leftover 0..63 bytes using the CR bits set at LRevAligned
	300
	301	COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64) // NOTE(review): macro defined outside this file; presumably k64Bit is the required capability and kHasAltivec the excluded one, matching the "64-bit without Altivec" header - confirm against machine/commpage.h