git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
	11	*
	12	* This Original Code and all software distributed under the License are
	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
	19	*
	20	* @APPLE_LICENSE_HEADER_END@
	21	*/
	22
	23	#define ASSEMBLER
	24	#include <sys/appleapiopts.h>
	25	#include <ppc/asm.h>
	26	#include <machine/cpu_capabilities.h>
	27	#include <machine/commpage.h>
	28
	29	.text
	30	.align 2
	31
	32
	33	/* *********************
	34	* * M E M S E T _ G 4 *
	35	* *********************
	36	*
	37	* This is a subroutine called by Libc memset and memset_pattern for large nonzero
	38	* operands (zero operands are funneled into bzero.) This version is for
	39	* 32-bit processors with a 32-byte cache line and Altivec.
	40	*
	41	* Registers at entry:
	42	* r4 = count of bytes to store (must be >= 32)
	43	* r8 = ptr to the 1st byte to store (16-byte aligned)
	44	* r9 = ptr to 16-byte pattern to store (16-byte aligned)
	45	* When we return:
	46	* r3 = not changed, since memset returns it
	47	* r4 = bytes remaining to store (will be <32)
	48	* r7 = not changed
	49	* r8 = ptr to next byte to store (still 16-byte aligned)
	50	* r12 = not changed (holds return value for memset)
	51	*/
	52
	53	#define kBig (3*64) // min length for the dcba path: three 64-byte chunks, so >=2 remain after the 16-byte alignment store (NB: must be >= 3*64)
	54
	55	.align 4
	56	memset_g4: // in: r4 = byte count (>= 32), r8 = 16-byte-aligned dest, r9 = ptr to 16-byte pattern
	57	cmplwi cr1,r4,kBig // big enough to warrant using dcba? (answer kept in cr1 for the blt below)
	58	mfspr r2,vrsave // we'll be using VRs; r2 keeps the caller's vrsave until the exit at 7:
	59	oris r0,r2,0x8000 // we use vr0
	60	andi. r5,r8,0x10 // is ptr 32-byte aligned? (sets cr0 for the beq below; r5 itself is reloaded next)
	61	mtspr vrsave,r0 // install the updated vrsave
	62	li r5,16 // get offsets for "stvx"
	63	lvx v0,0,r9 // load the pattern into v0
	64	li r6,32 // r6 = 32: stvx offset, also the dcba offset of the 2nd cache line
	65	blt cr1,LShort // not big enough to bother with dcba
	66	li r9,48 // r9 = 48: last stvx offset (pattern ptr is dead once v0 is loaded)
	67
	68	// cache line align
	69
	70	beq 2f // already aligned (cr0 still from the andi. above)
	71	stvx v0,0,r8 // store another 16 bytes to align
	72	addi r8,r8,16 // step past the 16 bytes just stored
	73	subi r4,r4,16 // and take them off the count
	74
	75	// Set up for inner loop.
	76	2:
	77	srwi r0,r4,6 // get count of 64-byte chunks (>=2)
	78	dcba 0,r8 // pre-allocate first cache line (possibly nop'd)
	79	rlwinm r4,r4,0,0x3F // mask down to residual count (0..63)
	80	subic r0,r0,1 // loop 1-too-few times; the last chunk is stored after the loop
	81	li r10,64 // get offsets to DCBA one chunk ahead
	82	li r11,64+32 // r11 = offset of the second 32-byte line of the next chunk
	83	mtctr r0 // ctr = number of 64-byte chunks - 1
	84	dcba r6,r8 // zero 2nd cache line (possibly nop'd)
	85	b 3f // enter DCBA loop
	86
	87	// Loop over 64-byte chunks. We DCBA one chunk ahead, which is a little faster.
	88	// Note that some G4s do not benefit from the DCBAs. We nop them in that case.
	89
	90	.align 4
	91	3:
	92	dcba r10,r8 // zero one 64-byte chunk ahead (possibly nop'd)
	93	dcba r11,r8 // second 32-byte line of that next chunk (possibly nop'd)
	94	stvx v0,0,r8 // store the current 64-byte chunk as four 16-byte vectors
	95	stvx v0,r5,r8 // +16
	96	stvx v0,r6,r8 // +32
	97	stvx v0,r9,r8 // +48
	98	addi r8,r8,64 // advance to the next chunk
	99	bdnz+ 3b // decrement ctr and loop while it is nonzero (+ = predicted taken)
	100
	101	// Last chunk, which we've already DCBAd.
	102
	103	stvx v0,0,r8 // final 64-byte chunk, again as four 16-byte vectors
	104	stvx v0,r5,r8 // +16
	105	stvx v0,r6,r8 // +32
	106	stvx v0,r9,r8 // +48
	107	addi r8,r8,64 // r8 now points just past the last 64-byte chunk
	108
	109	// loop over 32-byte chunks at end
	110	LShort: // entered directly when r4 < kBig, or by fall-through with r4 = 0..63
	111	srwi. r0,r4,5 // get count of 32-byte chunks (record form sets cr0 for the beq)
	112	rlwinm r4,r4,0,0x1F // mask down to residual count (0..31)
	113	beq 7f // no chunks so done
	114	mtctr r0 // ctr = number of 32-byte chunks
	115	6:
	116	stvx v0,0,r8 // store one 32-byte chunk as two 16-byte vectors
	117	stvx v0,r5,r8 // +16
	118	addi r8,r8,32 // advance to the next 32-byte chunk
	119	bdnz 6b // decrement ctr and loop while it is nonzero
	120	7:
	121	mtspr vrsave,r2 // restore caller's vrsave
	122	blr // return with r4 = residual count (<32), r8 = next byte to store
	123
	124
	125	COMMPAGE_DESCRIPTOR(memset_g4,_COMM_PAGE_MEMSET_PATTERN,kCache32+kHasAltivec,0, \
	126	kCommPageDCBA+kCommPage32) // NOTE(review): describes memset_g4 to the commpage machinery; the macro lives in machine/commpage.h (not visible here) - presumably (routine, commpage address, required capability bits, excluded capability bits, flags); confirm