[apple/xnu.git] / osfmk / ppc / commpage / memset_g5.s

/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code 
 * as defined in and that are subject to the Apple Public Source License 
 * Version 2.0 (the 'License'). You may not use this file except in 
 * compliance with the License.  The rights granted to you under the 
 * License may not be used to create, or enable the creation or 
 * redistribution of, unlawful or unlicensed copies of an Apple operating 
 * system, or to circumvent, violate, or enable the circumvention or 
 * violation of, any terms of an Apple operating system software license 
 * agreement.
 *
 * Please obtain a copy of the License at 
 * http://www.opensource.apple.com/apsl/ and read it before using this 
 * file.
 *
 * The Original Code and all software distributed under the License are 
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
 * Please see the License for the specific language governing rights and 
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */

#define	ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .align	2
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"                      
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */        

/* *********************
 * * M E M S E T _ G 5 *
 * *********************
 *
 * This is a subroutine called by Libc memset and memset_pattern for large nonzero
 * operands (zero operands are funneled into bzero.)  This version is for
 * 64-bit processors with a 128-byte cache line and Altivec.
 *
 * Registers at entry:
 *		r4 = count of bytes to store (must be >= 32)
 *      r8 = ptr to the 1st byte to store (16-byte aligned)
 *      r9 = ptr to 16-byte pattern to store (16-byte aligned)
 * When we return:
 *		r3 = not changed, since memset returns it
 *      r4 = bytes remaining to store (will be <32)
 *      r7 = not changed
 *      r8 = ptr to next byte to store (still 16-byte aligned)
 *     r12 = not changed (holds return value for memset)
 */

#define kBig    (3*128)                 // big enough to warrant using dcbz (NB: must be >= 3*128)
// Why kBig has a lower bound: up to 112 (0x70) bytes can be consumed aligning the
// pointer to a cache line, and the DCBZ path then needs at least 2 full 128-byte
// lines (its main loop runs "#lines - 1" times, so 1 line would load 0 into ctr).
// 112 + 2*128 = 368, which 3*128 covers.

// Registers used as scratch by this routine (all are overwritten):
//      r0       = vrsave temp, then loop counts
//      r2       = caller's vrsave (restored before return)
//      r5,r6,r9 = "stvx" offsets 16, 32, 48 (r9 is only reloaded with 48 on the
//                 kBig path, after the pattern has been loaded through it)
//      r10      = #bytes to cache line align, then ptr to 2nd half of the line
//      r11      = DCBZ look-ahead distance (128), kBig path only
//      v0       = the 16-byte pattern
//      ctr, cr0, cr1, cr7
        .align	5
memset_g5:
        cmplwi  cr1,r4,kBig             // big enough to warrant using dcbz? (result held in cr1)
        neg     r10,r8                  // start to align ptr
        mfspr   r2,vrsave               // we'll be using VRs; keep caller's vrsave in r2
        andi.   r10,r10,0x70            // get #bytes to cache line align (0..112); cr0 is tested by "beq 2f" below
        oris    r0,r2,0x8000            // we use vr0
        mtspr   vrsave,r0
        li      r5,16                   // get offsets for "stvx"
        lvx     v0,0,r9                 // load the pattern into v0 (r9 is free for reuse after this)
        li      r6,32
        blt     cr1,LShort              // not big enough to bother with dcbz
        li      r9,48                   // 4th "stvx" offset; LShort path only needs 0 and r5
        
        // cache line align
        // Store one 16-byte vector at a time until r8 reaches a 128-byte boundary.
        // None of the instructions since the "andi." above modify cr0.
        
        beq     2f                      // already aligned
1:
        subic.  r10,r10,16              // more to go? (sets cr0 for the "bne" below)
        stvx    v0,0,r8
        addi    r8,r8,16
        subi    r4,r4,16
        bne     1b
        
        // Loop over cache lines.  This code uses a private protocol with the kernel:
        // when the kernel emulates an alignment exception on a DCBZ that occurs in the
        // commpage, it zeroes CR7.  We use this to detect the case where we are operating on
        // uncached memory, and do not use DCBZ again in this code. We assume that either
        // all the operand is cacheable or none of it is, so we only check the first DCBZ.
        //
        // At this point r4 >= kBig - 112, so the line count computed below is >= 2.
2:
        cmpw    cr7,r3,r3               // set cr7_eq (kernel will clear if DCBZ faults)
        dcbzl   0,r8                    // zero first cache line (clearing cr7 if alignment exception)
        srwi    r0,r4,7                 // get #cache lines (>=2)
        rlwinm  r4,r4,0,0x7F            // mask down to residual count (0..127)
        bne--   cr7,LNoDcbz             // exit if we took alignment exception on the first DCBZ (r0 = full line count)
        subic   r0,r0,1                 // loop 1-too-few times ("subic" because "subi" with r0 as source reads literal 0; the carry it sets is never read)
        li      r11,128                 // set DCBZ look-ahead
        mtctr   r0
        b       3f                      // use loop that DCBZs
        
        // Loop over cache lines.  We DCBZ one line ahead, which is a little faster.
        // Each pass stores 8 vectors (128 bytes): four via r8 and four via r10 = r8+64.
        // It runs (#lines - 1) times, so the final look-ahead DCBZ lands on the last
        // full line of the operand and never beyond it.
        
        .align  5
3:
        dcbzl   r11,r8                  // zero one line ahead
        addi    r10,r8,64               // r10 = ptr to 2nd half of this line
        stvx    v0,0,r8
        stvx    v0,r5,r8
        stvx    v0,r6,r8
        stvx    v0,r9,r8
        addi    r8,r8,128               // advance to next line (r10 still addresses the current one)
        stvx    v0,0,r10
        stvx    v0,r5,r10
        stvx    v0,r6,r10
        stvx    v0,r9,r10
        bdnz++  3b
        
        li      r0,1                    // we've already DCBZ'd the last line
LNoDcbz:                                // r0: loop count
        mtctr   r0
        
        // Loop which does not DCBZ.  Normally this is only used for last cache line,
        // because we've already zeroed it.  If the first DCBZ took an alignment
        // exception, this loop instead stores every line (r0 = full line count).
4:        
        addi    r10,r8,64               // r10 = ptr to 2nd half of this line
        stvx    v0,0,r8
        stvx    v0,r5,r8
        stvx    v0,r6,r8
        stvx    v0,r9,r8
        addi    r8,r8,128
        stvx    v0,0,r10
        stvx    v0,r5,r10
        stvx    v0,r6,r10
        stvx    v0,r9,r10
        bdnz--  4b                      // optimize for the cacheable case
        
        // loop over 32-byte chunks
        // Reached directly when r4 < kBig, or by falling through with the
        // residual (0..127) left over from the cache line loops.
LShort:
        srwi.   r0,r4,5                 // get count of 32-byte chunks (sets cr0 for "beq" below)
        rlwinm  r4,r4,0,0x1F            // mask down to residual count (0..31); non-record form, cr0 unchanged
        beq     7f                      // no chunks so done
        mtctr   r0
6:
        stvx    v0,0,r8                 // two vectors = one 32-byte chunk
        stvx    v0,r5,r8
        addi    r8,r8,32
        bdnz++  6b
7:
        mtspr   vrsave,r2               // restore caller's vrsave
        blr                             // r4 = residual (<32), r8 = next byte to store


// Commpage registration for memset_g5.  NOTE(review): the macro is not defined in
// this file (presumably it comes from <machine/commpage.h>).  The capability flags
// appear to match the routine header (128-byte cache line, 64-bit, Altivec), and
// kPort32to64 presumably requests the 32-to-64-bit port described in the WARNING
// comment near the top of this file -- confirm against the macro definition.
	COMMPAGE_DESCRIPTOR(memset_g5,_COMM_PAGE_MEMSET_PATTERN,kCache128+k64Bit+kHasAltivec,0, \
				kCommPageBoth+kPort32to64)
Commit	Line	Data
91447636 A	1	/*
	2	* Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
	3	*
8ad349bb	4	* @APPLE_LICENSE_OSREFERENCE_HEADER_START@
91447636	5	*
8ad349bb A	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the
	10	* License may not be used to create, or enable the creation or
	11	* redistribution of, unlawful or unlicensed copies of an Apple operating
	12	* system, or to circumvent, violate, or enable the circumvention or
	13	* violation of, any terms of an Apple operating system software license
	14	* agreement.
	15	*
	16	* Please obtain a copy of the License at
	17	* http://www.opensource.apple.com/apsl/ and read it before using this
	18	* file.
	19	*
	20	* The Original Code and all software distributed under the License are
	21	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	22	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	23	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	24	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	25	* Please see the License for the specific language governing rights and
	26	* limitations under the License.
	27	*
	28	* @APPLE_LICENSE_OSREFERENCE_HEADER_END@
91447636 A	29	*/
	30
	31	#define ASSEMBLER
	32	#include <sys/appleapiopts.h>
	33	#include <ppc/asm.h>
	34	#include <machine/cpu_capabilities.h>
	35	#include <machine/commpage.h>
	36
	37	.text
	38	.align 2
	39	/*
	40	* WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
	41	* to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
	42	* simple transformations:
	43	* - all word compares are changed to doubleword
	44	* - all "srwi[.]" opcodes are changed to "srdi[.]"
	45	* Nothing else is done. For this to work, the following rules must be
	46	* carefully followed:
	47	* - do not use carry or overflow
	48	* - only use record mode if you are sure the results are mode-invariant
	49	* for example, all "andi." and almost all "rlwinm." are fine
	50	* - do not use "slwi", "slw", or "srw"
	51	* An imaginative programmer could break the porting model in other ways, but the above
	52	* are the most likely problem areas. It is perhaps surprising how well in practice
	53	* this simple method works.
	54	*/
	55
	56	/* *********************
	57	* * M E M S E T _ G 5 *
	58	* *********************
	59	*
	60	* This is a subroutine called by Libc memset and memset_pattern for large nonzero
	61	* operands (zero operands are funneled into bzero.) This version is for
	62	* 64-bit processors with a 128-byte cache line and Altivec.
	63	*
	64	* Registers at entry:
	65	* r4 = count of bytes to store (must be >= 32)
	66	* r8 = ptr to the 1st byte to store (16-byte aligned)
	67	* r9 = ptr to 16-byte pattern to store (16-byte aligned)
	68	* When we return:
	69	* r3 = not changed, since memset returns it
	70	* r4 = bytes remaining to store (will be <32)
	71	* r7 = not changed
	72	* r8 = ptr to next byte to store (still 16-byte aligned)
	73	* r12 = not changed (holds return value for memset)
	74	*/
	75
	76	#define kBig (3*128) // big enough to warrant using dcbz (NB: must be >= 3*128)
	77
	78	.align 5
	79	memset_g5:
	80	cmplwi cr1,r4,kBig // big enough to warrant using dcbz?
	81	neg r10,r8 // start to align ptr
	82	mfspr r2,vrsave // we'll be using VRs
	83	andi. r10,r10,0x70 // get #bytes to cache line align
	84	oris r0,r2,0x8000 // we use vr0
	85	mtspr vrsave,r0
	86	li r5,16 // get offsets for "stvx"
	87	lvx v0,0,r9 // load the pattern into v0
	88	li r6,32
	89	blt cr1,LShort // not big enough to bother with dcbz
	90	li r9,48
	91
	92	// cache line align
93
94	beq 2f // already aligned
95	1:
96	subic. r10,r10,16 // more to go?
97	stvx v0,0,r8
98	addi r8,r8,16
99	subi r4,r4,16
100	bne 1b
101
102	// Loop over cache lines. This code uses a private protocol with the kernel:
103	// when the kernel emulates an alignment exception on a DCBZ that occurs in the
104	// commpage, it zeroes CR7. We use this to detect the case where we are operating on
105	// uncached memory, and do not use DCBZ again in this code. We assume that either
106	// all the operand is cacheable or none of it is, so we only check the first DCBZ.
107	2:
108	cmpw cr7,r3,r3 // set cr7_eq (kernel will clear if DCBZ faults)
109	dcbzl 0,r8 // zero first cache line (clearing cr7 if alignment exception)
110	srwi r0,r4,7 // get #cache lines (>=2)
111	rlwinm r4,r4,0,0x7F // mask down to residual count (0..127)
112	bne-- cr7,LNoDcbz // exit if we took alignment exception on the first DCBZ
113	subic r0,r0,1 // loop 1-too-few times
114	li r11,128 // set DCBZ look-ahead
115	mtctr r0
116	b 3f // use loop that DCBZs
117
118	// Loop over cache lines. We DCBZ one line ahead, which is a little faster.
119
120	.align 5
121	3:
122	dcbzl r11,r8 // zero one line ahead
123	addi r10,r8,64
124	stvx v0,0,r8
125	stvx v0,r5,r8
126	stvx v0,r6,r8
127	stvx v0,r9,r8
128	addi r8,r8,128
129	stvx v0,0,r10
130	stvx v0,r5,r10
131	stvx v0,r6,r10
132	stvx v0,r9,r10
133	bdnz++ 3b
134
135	li r0,1 // we've already DCBZ'd the last line
136	LNoDcbz: // r0: loop count
137	mtctr r0
138
139	// Loop which does not DCBZ. Normally this is only used for last cache line,
140	// because we've already zeroed it.
141	4:
142	addi r10,r8,64
143	stvx v0,0,r8
144	stvx v0,r5,r8
145	stvx v0,r6,r8
146	stvx v0,r9,r8
147	addi r8,r8,128
148	stvx v0,0,r10
149	stvx v0,r5,r10
150	stvx v0,r6,r10
151	stvx v0,r9,r10
152	bdnz-- 4b // optimize for the cacheable case
153
154	// loop over 32-byte chunks
155	LShort:
156	srwi. r0,r4,5 // get count of 32-byte chunks
157	rlwinm r4,r4,0,0x1F // mask down to residual count (0..31)
158	beq 7f // no chunks so done
159	mtctr r0
160	6:
161	stvx v0,0,r8
162	stvx v0,r5,r8
163	addi r8,r8,32
164	bdnz++ 6b
165	7:
166	mtspr vrsave,r2 // restore caller's vrsave
167	blr
168
169
170	COMMPAGE_DESCRIPTOR(memset_g5,_COMM_PAGE_MEMSET_PATTERN,kCache128+k64Bit+kHasAltivec,0, \
171	kCommPageBoth+kPort32to64)