[apple/xnu.git] / osfmk / ppc / commpage / memset_g4.s

/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code 
 * as defined in and that are subject to the Apple Public Source License 
 * Version 2.0 (the 'License'). You may not use this file except in 
 * compliance with the License.  The rights granted to you under the 
 * License may not be used to create, or enable the creation or 
 * redistribution of, unlawful or unlicensed copies of an Apple operating 
 * system, or to circumvent, violate, or enable the circumvention or 
 * violation of, any terms of an Apple operating system software license 
 * agreement.
 *
 * Please obtain a copy of the License at 
 * http://www.opensource.apple.com/apsl/ and read it before using this 
 * file.
 *
 * The Original Code and all software distributed under the License are 
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
 * Please see the License for the specific language governing rights and 
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */

#define	ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .align	2


/* *********************
 * * M E M S E T _ G 4 *
 * *********************
 *
 * This is a subroutine called by Libc memset and memset_pattern for large nonzero
 * operands (zero operands are funneled into bzero.)  This version is for
 * 32-bit processors with a 32-byte cache line and Altivec.
 *
 * Outline: the 16-byte pattern is loaded into v0 and written with "stvx".
 * Operands of at least kBig bytes are first brought to 32-byte (cache line)
 * alignment and then stored in 64-byte chunks, issuing "dcba" one chunk ahead
 * of the stores.  Whatever remains (and every operand shorter than kBig) is
 * stored in 32-byte chunks.  Any residue of fewer than 32 bytes is left for
 * the caller.
 *
 * Registers at entry:
 *      r4 = count of bytes to store (must be >= 32)
 *      r8 = ptr to the 1st byte to store (16-byte aligned)
 *      r9 = ptr to 16-byte pattern to store (16-byte aligned)
 * When we return:
 *      r3 = not changed, since memset returns it
 *      r4 = bytes remaining to store (will be <32)
 *      r7 = not changed
 *      r8 = ptr to next byte to store (still 16-byte aligned)
 *     r12 = not changed (holds return value for memset)
 * Also modified:
 *      r0, r2, r5, r6, ctr, cr0, cr1 and v0 on every path;
 *      r9, r10, r11 and XER[CA] (via subic) on the kBig path only.
 *      vrsave is modified while we run but restored to the caller's value on exit.
 */

// The 64-byte loop below runs (chunk count - 1) times and then stores one more
// chunk, so at least two 64-byte chunks must remain after the optional 16-byte
// alignment store.
#define kBig    (3*64)                  // big enough to warrant using dcba (NB: must be >= 3*64)

        .align	4
memset_g4:
        cmplwi  cr1,r4,kBig             // cr1: big enough to warrant using dcba?
        mfspr   r2,vrsave               // we'll be using VRs; keep caller's vrsave in r2
        oris    r0,r2,0x8000            // we use vr0, so set its bit in vrsave
        andi.   r5,r8,0x10              // cr0: is ptr 32-byte aligned? (r5 is only scratch here)
        mtspr   vrsave,r0
        li      r5,16                   // get offsets for "stvx"
        lvx     v0,0,r9                 // load the pattern into v0
        li      r6,32
        blt     cr1,LShort              // not big enough to bother with dcba
        li      r9,48                   // pattern ptr no longer needed; reuse r9 as an stvx offset
        
        // cache line align
        
        beq     2f                      // cr0 still from the andi. above: already aligned
        stvx    v0,0,r8                 // store another 16 bytes to align
        addi    r8,r8,16
        subi    r4,r4,16
        
        // Set up for inner loop.
2:
        srwi    r0,r4,6                 // get count of 64-byte chunks (>=2)
        dcba    0,r8                    // pre-allocate first cache line (possibly nop'd)
        rlwinm  r4,r4,0,0x3F            // mask down to residual count (0..63)
        subic   r0,r0,1                 // loop 1-too-few times; last chunk is stored after the loop
        li      r10,64                  // get offsets to DCBA one chunk ahead
        li      r11,64+32
        mtctr   r0
        dcba    r6,r8                   // zero 2nd cache line (possibly nop'd)
        b       3f                      // enter DCBA loop
        
        // Loop over 64-byte chunks.  We DCBA one chunk ahead, which is a little faster.
        // Note that some G4s do not benefit from the DCBAs.  We nop them in that case.
        // Offsets: r5=16, r6=32, r9=48 for the stores; r10=64, r11=96 for the DCBAs.
        
        .align  4
3:
        dcba    r10,r8                  // zero one 64-byte chunk ahead (possibly nop'd)
        dcba    r11,r8
        stvx    v0,0,r8
        stvx    v0,r5,r8
        stvx    v0,r6,r8
        stvx    v0,r9,r8
        addi    r8,r8,64
        bdnz+   3b
        
        // Last chunk, which we've already DCBAd.

        stvx    v0,0,r8
        stvx    v0,r5,r8
        stvx    v0,r6,r8
        stvx    v0,r9,r8
        addi    r8,r8,64
        
        // loop over 32-byte chunks at end
        // (reached directly when count < kBig, or by fall-through with r4 = 0..63)
LShort:
        srwi.   r0,r4,5                 // get count of 32-byte chunks (sets cr0)
        rlwinm  r4,r4,0,0x1F            // mask down to residual count (0..31)
        beq     7f                      // no chunks so done
        mtctr   r0
6:
        stvx    v0,0,r8
        stvx    v0,r5,r8
        addi    r8,r8,32
        bdnz    6b
7:
        mtspr   vrsave,r2               // restore caller's vrsave
        blr


	// Commpage descriptor for memset_g4.
	// NOTE(review): COMMPAGE_DESCRIPTOR is defined outside this file (presumably in
	// <machine/commpage.h>).  The arguments appear to be: the routine label, its commpage
	// address, the capability bits it requires (kCache32+kHasAltivec), the bits it must not
	// have (0), and special-handling flags (kCommPageDCBA+kCommPage32) -- kCommPageDCBA
	// presumably being what lets the "possibly nop'd" dcba instructions above be patched
	// out.  Confirm against the macro definition.
	COMMPAGE_DESCRIPTOR(memset_g4,_COMM_PAGE_MEMSET_PATTERN,kCache32+kHasAltivec,0, \
				kCommPageDCBA+kCommPage32)
Commit	Line	Data
91447636 A	1	/*
	2	* Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
	3	*
8ad349bb	4	* @APPLE_LICENSE_OSREFERENCE_HEADER_START@
91447636	5	*
8ad349bb A	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the
	10	* License may not be used to create, or enable the creation or
	11	* redistribution of, unlawful or unlicensed copies of an Apple operating
	12	* system, or to circumvent, violate, or enable the circumvention or
	13	* violation of, any terms of an Apple operating system software license
	14	* agreement.
	15	*
	16	* Please obtain a copy of the License at
	17	* http://www.opensource.apple.com/apsl/ and read it before using this
	18	* file.
	19	*
	20	* The Original Code and all software distributed under the License are
	21	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	22	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	23	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	24	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	25	* Please see the License for the specific language governing rights and
	26	* limitations under the License.
	27	*
	28	* @APPLE_LICENSE_OSREFERENCE_HEADER_END@
91447636 A	29	*/
	30
	31	#define ASSEMBLER
	32	#include <sys/appleapiopts.h>
	33	#include <ppc/asm.h>
	34	#include <machine/cpu_capabilities.h>
	35	#include <machine/commpage.h>
	36
	37	.text
	38	.align 2
	39
	40
	41	/* *********************
	42	* * M E M S E T _ G 4 *
	43	* *********************
	44	*
	45	* This is a subroutine called by Libc memset and memset_pattern for large nonzero
	46	* operands (zero operands are funneled into bzero.) This version is for
	47	* 32-bit processors with a 32-byte cache line and Altivec.
	48	*
	49	* Registers at entry:
	50	* r4 = count of bytes to store (must be >= 32)
	51	* r8 = ptr to the 1st byte to store (16-byte aligned)
	52	* r9 = ptr to 16-byte pattern to store (16-byte aligned)
	53	* When we return:
	54	* r3 = not changed, since memset returns it
	55	* r4 = bytes remaining to store (will be <32)
	56	* r7 = not changed
	57	* r8 = ptr to next byte to store (still 16-byte aligned)
	58	* r12 = not changed (holds return value for memset)
	59	*/
	60
	61	#define kBig (3*64) // big enough to warrant using dcba (NB: must be >= 3*64)
	62
	63	.align 4
	64	memset_g4:
	65	cmplwi cr1,r4,kBig // big enough to warrant using dcbz?
	66	mfspr r2,vrsave // we'll be using VRs
	67	oris r0,r2,0x8000 // we use vr0
	68	andi. r5,r8,0x10 // is ptr 32-byte aligned?
	69	mtspr vrsave,r0
	70	li r5,16 // get offsets for "stvx"
	71	lvx v0,0,r9 // load the pattern into v0
	72	li r6,32
	73	blt cr1,LShort // not big enough to bother with dcba
	74	li r9,48
	75
	76	// cache line align
	77
	78	beq 2f // already aligned
	79	stvx v0,0,r8 // store another 16 bytes to align
	80	addi r8,r8,16
	81	subi r4,r4,16
	82
	83	// Set up for inner loop.
	84	2:
	85	srwi r0,r4,6 // get count of 64-byte chunks (>=2)
	86	dcba 0,r8 // pre-allocate first cache line (possibly nop'd)
	87	rlwinm r4,r4,0,0x3F // mask down to residual count (0..63)
	88	subic r0,r0,1 // loop 1-too-few times
	89	li r10,64 // get offsets to DCBA one chunk ahead
	90	li r11,64+32
	91	mtctr r0
	92	dcba r6,r8 // zero 2nd cache line (possibly nop'd)
93	b 3f // enter DCBA loop
94
95	// Loop over 64-byte chunks. We DCBA one chunk ahead, which is a little faster.
96	// Note that some G4s do not benefit from the DCBAs. We nop them in that case.
97
98	.align 4
99	3:
100	dcba r10,r8 // zero one 64-byte chunk ahead (possibly nop'd)
101	dcba r11,r8
102	stvx v0,0,r8
103	stvx v0,r5,r8
104	stvx v0,r6,r8
105	stvx v0,r9,r8
106	addi r8,r8,64
107	bdnz+ 3b
108
109	// Last chunk, which we've already DCBAd.
110
111	stvx v0,0,r8
112	stvx v0,r5,r8
113	stvx v0,r6,r8
114	stvx v0,r9,r8
115	addi r8,r8,64
116
117	// loop over 32-byte chunks at end
118	LShort:
119	srwi. r0,r4,5 // get count of 32-byte chunks
120	rlwinm r4,r4,0,0x1F // mask down to residual count (0..31)
121	beq 7f // no chunks so done
122	mtctr r0
123	6:
124	stvx v0,0,r8
125	stvx v0,r5,r8
126	addi r8,r8,32
127	bdnz 6b
128	7:
129	mtspr vrsave,r2 // restore caller's vrsave
130	blr
131
132
133	COMMPAGE_DESCRIPTOR(memset_g4,_COMM_PAGE_MEMSET_PATTERN,kCache32+kHasAltivec,0, \
134	kCommPageDCBA+kCommPage32)