git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. Please obtain a copy of the License at
	10	* http://www.opensource.apple.com/apsl/ and read it before using this
	11	* file.
	12	*
	13	* The Original Code and all software distributed under the License are
	14	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	15	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	16	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	17	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	18	* Please see the License for the specific language governing rights and
	19	* limitations under the License.
	20	*
	21	* @APPLE_LICENSE_HEADER_END@
	22	*/
	23
	24	#include <ppc/asm.h>
	25	#include <ppc/exception.h>
	26	#include <assym.s>
	27
	28	.text
	29	.align 2
	30	.globl _memset
	31	.globl _bzero
	32	.globl _bzero_nc
	33	.globl _bzero_phys
	34
	35
	36	// ***********************
	37	// * B Z E R O _ P H Y S *
	38	// ***********************
	39	//
	40	// void bzero_phys(addr64_t phys_addr, uint32_t length);
	41	//
	42	// Takes a phys addr in (r3,r4), and length in r5. We leave cache on.
	43
	44	.align 5
	45	LEXT(bzero_phys) // zero "length" bytes starting at a 64-bit physical address
	46	mflr r12 // save return address in r12 (the "bl" calls below overwrite LR)
	47	rlwinm r3,r3,0,1,0 // coalesce long-long in (r3,r4) into reg64_t in r3: replicate low word of r3 into its upper half
	48	rlwimi r3,r4,0,0,31 // ...then insert r4 as the low 32 bits of r3
	49	mr r4,r5 // put length where bzero() expects it
	50	bl EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
	51	bl EXT(bzero) // use normal bzero() routine (it does not touch r11 or r12)
	52	mtlr r12 // restore return
	53	b EXT(ml_restore) // restore MSR, turning DR on and SF off (plain branch: presumably ml_restore returns to our caller via the LR restored above)
	54
	55
	56	// *******************
	57	// * B Z E R O _ N C *
	58	// *******************
	59	//
	60	// void bzero_nc(char *addr, unsigned int length);
	61	//
	62	// For use with uncached memory. Doesn't seem to be used at all, so probably not
	63	// performance critical. NB: we must avoid unaligned stores, because some
	64	// machines (eg, 970) take alignment exceptions on _any_ unaligned op to uncached
	65	// memory. Of course, we must also avoid dcbz.
	66
	67	LEXT(bzero_nc) // r3 = addr, r4 = length; zeroes without dcbz or unaligned stores
	68	cmplwi cr1,r4,20 // too short to bother with 16-byte loops?
	69	cmplwi cr7,r4,0 // check for (len==0)
	70	li r6,0 // get a 0
	71	bge cr1,bznc1 // skip if length >=20
	72	mtctr r4 // set up byte loop
	73	beqlr-- cr7 // done if len=0
	74
	75	// Short operands, loop over bytes.
	76
	77	bznc0:
	78	stb r6,0(r3) // zero one byte
	79	addi r3,r3,1 // advance ptr
	80	bdnz bznc0 // loop until all "length" bytes are done
	81	blr
	82
	83	// Handle operands long enough to do doubleword stores; we must doubleword
	84	// align, to avoid alignment exceptions.
	85
	86	bznc1:
	87	neg r7,r3 // start to compute #bytes to align
	88	mfsprg r10,2 // get feature flags
	89	andi. r0,r7,7 // get #bytes to doubleword align
	90	mr r5,r3 // make copy of operand ptr in r5, as bzero_tail expects
	91	mtcrf 0x02,r10 // put pf64Bitb etc in cr6
	92	beq bzero_tail // already doubleword aligned
	93	sub r4,r4,r0 // adjust count
	94	mtctr r0 // set up loop
	95	bznc2: // zero bytes until doubleword aligned
	96	stb r6,0(r5) // zero one byte
	97	addi r5,r5,1 // advance working ptr
	98	bdnz bznc2 // loop over the 1..7 alignment bytes
	99	b bzero_tail // join bzero, now that r5 is aligned
	100
	101
	102	// *************       ***************
	103	// * B Z E R O *  and  * M E M S E T *
	104	// *************       ***************
	105	//
	106	// void * memset(void *b, int c, size_t len);
	107	// void bzero(void *b, size_t len);
	108	//
	109	// These routines support G3, G4, and the 970, and run in both 32 and
	110	// 64-bit mode. Lengths (size_t) are always 32 bits.
	111	//
	112	// Register use:
	113	// r0 = temp
	114	// r2 = temp
	115	// r3 = original ptr, not changed since memset returns it
	116	// r4 = count of bytes to set
	117	// r5 = working operand ptr ("rp")
	118	// r6 = value to store (usually 0)
	119	// r7-r9 = temps
	120	// r10 = feature flags
	121	// r11 = old MSR (if bzero_phys)
	122	// r12 = return address (if bzero_phys)
	123	// cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)
	124
	125	.align 5
	126	LEXT(memset) // void * memset(void *b, int c, size_t len);
	127	andi. r6,r4,0xFF // copy value to working register, test for 0
	128	mr r4,r5 // move length to working register
	129	bne-- memset1 // skip if nonzero; a zero fill value falls through into bzero
	130	LEXT(bzero) // void bzero(void *b, size_t len);
	131	dcbtst 0,r3 // touch in 1st cache block
	132	mfsprg r10,2 // get features
	133	li r6,0 // get a 0
	134	neg r7,r3 // start to compute #bytes to align
	135	andi. r0,r10,pf128Byte+pf32Byte // get cache line size (the masked flag bits are used directly as a byte count)
	136	mtcrf 0x02,r10 // put pf128Byte etc in cr6
	137	cmplw r4,r0 // operand length >= cache line size?
	138	mr r5,r3 // make copy of operand ptr (can't change r3)
	139	blt bzero_tail // too short for dcbz (or dcbz128)
	140	rlwinm r0,r7,0,0x1F // get #bytes to 32-byte align
	141	rlwinm r9,r7,0,0x7F // get #bytes to 128-byte align
	142	bt++ pf128Byteb,bzero_128 // skip if 128-byte processor
	143
	144	// Operand length >=32 and cache line size is 32.
	145	// r0 = #bytes to 32-byte align
	146	// r4 = length
	147	// r5 = ptr to operand
	148	// r6 = 0
	149
	150	sub r2,r4,r0 // r2 <- length remaining after 32-byte aligning
	151	cmpwi cr1,r0,0 // already 32-byte aligned? (result kept in cr1, tested below)
	152	srwi. r8,r2,5 // get #32-byte chunks
	153	beq bzero_tail // not long enough to dcbz (r4 and r5 are still the unadjusted length and ptr)
	154	mtctr r8 // set up loop count
	155	rlwinm r4,r2,0,27,31 // mask down to leftover byte count
	156	beq cr1,bz_dcbz32 // skip if already 32-byte aligned
	157
	158	// 32-byte align. We just store 32 0s, rather than test and use conditional
	159	// branches. This is usually faster, because there are no mispredicts.
	160
	161	stw r6,0(r5) // zero next 32 bytes
	162	stw r6,4(r5)
	163	stw r6,8(r5)
	164	stw r6,12(r5)
	165	stw r6,16(r5)
	166	stw r6,20(r5)
	167	stw r6,24(r5)
	168	stw r6,28(r5)
	169	add r5,r5,r0 // now r5 is 32-byte aligned (bytes stored beyond this point are zeroed again by dcbz)
	170	b bz_dcbz32 // branch over any alignment padding into the loop
	171
	172	// Loop doing 32-byte version of DCBZ instruction.
	173
	174	.align 4 // align the inner loop
	175	bz_dcbz32:
	176	dcbz 0,r5 // zero another 32 bytes
	177	addi r5,r5,32 // advance to next 32-byte block
	178	bdnz bz_dcbz32 // loop over all full blocks, then fall through into bzero_tail
	179
	180	// Store trailing bytes. This routine is used both by bzero and memset.
	181	// r4 = #bytes to store (may be large if memset)
	182	// r5 = address
	183	// r6 = value to store (in all 8 bytes)
	184	// cr6 = pf64Bit etc flags
	185
	186	bzero_tail:
	187	srwi. r0,r4,4 // get #(16-byte-chunks)
	188	mtcrf 0x01,r4 // remaining byte count to cr7 (CR bits 28/29/30/31 = 8/4/2/1 bytes left over)
	189	beq bzt3 // no 16-byte chunks
	190	mtctr r0 // set up loop count
	191	bt++ pf64Bitb,bzt2 // skip if 64-bit processor
	192	b bzt1 // 32-bit processor: branch over alignment padding into the loop
	193	.align 5
	194	bzt1: // loop over 16-byte chunks on 32-bit processor
	195	stw r6,0(r5) // store 16 bytes as four words
	196	stw r6,4(r5)
	197	stw r6,8(r5)
	198	stw r6,12(r5)
	199	addi r5,r5,16 // advance ptr
	200	bdnz bzt1
	201	b bzt3 // go store the 0..15 leftover bytes using word stores
	202	.align 5
	203	bzt2: // loop over 16-byte chunks on 64-bit processor
	204	std r6,0(r5) // store 16 bytes as two doublewords
	205	std r6,8(r5)
	206	addi r5,r5,16 // advance ptr
	207	bdnz bzt2
	208	bf 28,bzt4 // 8-byte chunk?
	209	std r6,0(r5) // yes, store it as one doubleword
	210	addi r5,r5,8
	211	b bzt4 // skip the word-store version of the 8-byte chunk
	212	bzt3:
	213	bf 28,bzt4 // 8-byte chunk?
	214	stw r6,0(r5) // yes, store it as two words
	215	stw r6,4(r5)
	216	addi r5,r5,8
	217	bzt4:
	218	bf 29,bzt5 // word?
	219	stw r6,0(r5)
	220	addi r5,r5,4
	221	bzt5:
	222	bf 30,bzt6 // halfword?
	223	sth r6,0(r5)
	224	addi r5,r5,2
	225	bzt6:
	226	bflr 31 // byte? (return now if not)
	227	stb r6,0(r5) // store the final byte
	228	blr
	229
	230	// Operand length is >=128 and cache line size is 128. We assume that
	231	// because the linesize is 128 bytes, this is a 64-bit processor.
	232	// r4 = length
	233	// r5 = ptr to operand
	234	// r6 = 0
	235	// r7 = neg(r5)
	236	// r9 = #bytes to 128-byte align
	237
	238	.align 5
	239	bzero_128:
	240	sub r2,r4,r9 // r2 <- length remaining after cache-line aligning
	241	rlwinm r0,r7,0,0xF // r0 <- #bytes to 16-byte align
	242	srwi. r8,r2,7 // r8 <- number of cache lines to 0
	243	std r6,0(r5) // always store 16 bytes to 16-byte align...
	244	std r6,8(r5) // ...even if too short for dcbz128
	245	add r5,r5,r0 // 16-byte align ptr
	246	sub r4,r4,r0 // adjust count
	247	beq bzero_tail // r8==0, not long enough to dcbz128 (cr0 is still from the srwi. above)
	248	sub. r7,r9,r0 // get #bytes remaining to 128-byte align
	249	rlwinm r4,r2,0,0x7F // r4 <- length remaining after dcbz128'ing
	250	mtctr r8 // set up dcbz128 loop
	251	beq bz_dcbz128 // already 128-byte aligned
	252	b bz_align // enter loop over 16-byte chunks
	253
	254	// 128-byte align by looping over 16-byte chunks.
	255
	256	.align 5
	257	bz_align: // loop over 16-byte chunks
	258	subic. r7,r7,16 // more to go?
	259	std r6,0(r5) // zero the next 16 bytes
	260	std r6,8(r5)
	261	addi r5,r5,16 // advance ptr
	262	bgt bz_align // loop while bytes remain before the 128-byte boundary
	263
	264	b bz_dcbz128 // enter dcbz128 loop
	265
	266	// Loop over 128-byte cache lines.
	267	// r4 = length remaining after cache lines (0..127)
	268	// r5 = ptr (128-byte aligned)
	269	// r6 = 0
	270	// ctr = count of cache lines to 0
	271
	272	.align 5
	273	bz_dcbz128:
	274	dcbz128 0,r5 // zero a 128-byte cache line
	275	addi r5,r5,128 // advance to next cache line
	276	bdnz bz_dcbz128 // loop over all full cache lines
	277
	278	b bzero_tail // handle leftovers
	279
	280
	281	// Handle memset() for nonzero values. This case is relatively infrequent;
	282	// the large majority of memset() calls are for 0.
	283	// r3 = ptr
	284	// r4 = count
	285	// r6 = value in lower byte (nonzero)
	286
	287	memset1:
	288	cmplwi r4,16 // too short to bother aligning?
	289	rlwimi r6,r6,8,16,23 // replicate value to low 2 bytes
	290	mr r5,r3 // make working copy of operand ptr
	291	rlwimi r6,r6,16,0,15 // value now in all 4 bytes
	292	blt bzero_tail // length<16, we won't be using "std" (bzero_tail only tests cr6 when there are 16-byte chunks)
	293	mfsprg r10,2 // get feature flags
	294	neg r7,r5 // start to compute #bytes to align
	295	rlwinm r6,r6,0,1,0 // value now in all 8 bytes (if 64-bit)
	296	andi. r0,r7,7 // r0 <- #bytes to doubleword align
	297	stw r6,0(r5) // store 8 bytes to avoid a loop
	298	stw r6,4(r5) // (these cover the 0..7 bytes skipped by the alignment below)
	299	mtcrf 0x02,r10 // get pf64Bit flag etc in cr6
	300	sub r4,r4,r0 // adjust count
	301	add r5,r5,r0 // doubleword align ptr
	302	b bzero_tail // store the rest from the aligned ptr
	303
	304
	305