2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
32 #include <sys/appleapiopts.h>
34 #include <machine/cpu_capabilities.h>
35 #include <machine/commpage.h>
41 /* *********************
42 * * M E M S E T _ G 4 *
43 * *********************
45 * This is a subroutine called by Libc memset and memset_pattern for large nonzero
46 * operands (zero operands are funneled into bzero.) This version is for
47 * 32-bit processors with a 32-byte cache line and Altivec.
50 * On entry:  r4 = count of bytes to store (must be >= 32)
51 *            r8 = ptr to the 1st byte to store (16-byte aligned)
52 *            r9 = ptr to 16-byte pattern to store (16-byte aligned)
54 * On return: r3 = not changed, since memset returns it
55 *            r4 = bytes remaining to store (will be <32)
57 *            r8 = ptr to next byte to store (still 16-byte aligned)
58 *            r12 = not changed (holds return value for memset)
// kBig: byte-count threshold compared (unsigned) against r4 at routine entry.
// Counts below kBig branch to LShort and skip the dcba-assisted 64-byte loop.
// NOTE(review): the ">= 3*64" floor presumably guarantees that at least two
// 64-byte chunks remain after the optional 16-byte alignment store -- confirm
// against the full routine before lowering it.
61 #define kBig (3*64) // big enough to warrant using dcba (NB: must be >= 3*64)
// memset_g4: store the 16-byte pattern at (r9) repeatedly starting at r8.
//
// Register roles in the code below:
//   r0  = scratch (modified vrsave image, then chunk counts)
//   r2  = caller's vrsave, restored before returning
//   r4  = byte count, reduced to the residual as chunks are accounted for
//   r5  = 16, index offset for stvx
//   r8  = destination pointer
//   r10 = 64, index offset used to dcba one chunk ahead
//   v0  = the 16-byte pattern
//   cr0 = alignment test result, later "any 32-byte chunks?" test result
//   cr1 = result of comparing the count with kBig
65 cmplwi cr1,r4,kBig // cr1 = unsigned compare of count with kBig: big enough to warrant using dcba?
66 mfspr r2,vrsave // we'll be using VRs, so keep the caller's vrsave in r2
67 oris r0,r2,0x8000 // r0 = caller's vrsave with the top bit set, since we use vr0
68 andi. r5,r8,0x10 // is ptr 32-byte aligned? (cr0 EQ if so; r5 is only scratch here)
70 li r5,16 // get offsets for "stvx" (li does not disturb cr0)
71 lvx v0,0,r9 // load the pattern into v0
73 blt cr1,LShort // count < kBig: not big enough to bother with dcba
// From here on the operand is at least kBig bytes. cr0 still holds the result
// of the alignment test made by the andi. above.
78 beq 2f // already aligned
79 stvx v0,0,r8 // store another 16 bytes to align
83 // Set up for inner loop.
85 srwi r0,r4,6 // get count of 64-byte chunks (>=2)
86 dcba 0,r8 // pre-allocate first cache line (possibly nop'd)
87 rlwinm r4,r4,0,0x3F // mask down to residual count (0..63)
88 subic r0,r0,1 // loop 1-too-few times; the last chunk is handled after the loop
89 li r10,64 // get offsets to DCBA one chunk ahead
// NOTE(review): r6 is not set in the lines visible here; presumably it holds 32
// (one cache line), making this the 2nd line of the first chunk -- TODO confirm.
92 dcba r6,r8 // pre-allocate 2nd cache line (possibly nop'd)
93 b 3f // enter DCBA loop
95 // Loop over 64-byte chunks. We DCBA one chunk ahead, which is a little faster.
96 // Note that some G4s do not benefit from the DCBAs. We nop them in that case.
// dcba only establishes the line in the cache without fetching it from memory;
// architecturally the line's contents are undefined until the stores fill it.
100 dcba r10,r8 // pre-allocate a cache line in the chunk 64 bytes ahead (possibly nop'd)
109 // Last chunk, which we've already DCBAd.
117 // loop over 32-byte chunks at end
119 srwi. r0,r4,5 // get count of 32-byte chunks (record form: cr0 EQ if zero)
120 rlwinm r4,r4,0,0x1F // mask down to residual count (0..31); leaves cr0 intact
121 beq 7f // no chunks so done
129 mtspr vrsave,r2 // restore caller's vrsave
// Descriptor for this routine.
// NOTE(review): argument meanings are inferred from the names only -- routine
// label, commpage slot, required capabilities (32-byte cache line + Altivec),
// a 0 field, and special flags (DCBA handling, 32-bit) -- confirm against the
// COMMPAGE_DESCRIPTOR definition in <machine/commpage.h>.
133 COMMPAGE_DESCRIPTOR(memset_g4,_COMM_PAGE_MEMSET_PATTERN,kCache32+kHasAltivec,0, \
134 kCommPageDCBA+kCommPage32)