/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .align  2
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done. For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas. It is perhaps surprising how well in practice
 * this simple method works.
 */
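/*
 * For example, here is a hypothetical before/after sketch of the port (the
 * transformation is applied by the kernel to the commpage image, not to
 * this source):
 *
 *      32-bit version                  64-bit version
 *      cmplwi  cr1,r4,kBig     ==>     cmpldi  cr1,r4,kBig
 *      srwi.   r0,r4,5         ==>     srdi.   r0,r4,5
 *
 * whereas "andi." and the masking forms of "rlwinm." used below are left
 * unchanged, since their results are the same in both modes.
 */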

/* *********************
 * * M E M S E T _ G 5 *
 * *********************
 *
 * This is a subroutine called by Libc memset and memset_pattern for large nonzero
 * operands (zero operands are funneled into bzero.) This version is for
 * 64-bit processors with a 128-byte cache line and Altivec.
 *
 * Registers at entry:
 *      r4 = count of bytes to store (must be >= 32)
 *      r8 = ptr to the 1st byte to store (16-byte aligned)
 *      r9 = ptr to 16-byte pattern to store (16-byte aligned)
 * When we return:
 *      r3 = not changed, since memset returns it
 *      r4 = bytes remaining to store (will be <32)
 *      r7 = not changed
 *      r8 = ptr to next byte to store (still 16-byte aligned)
 *      r12 = not changed (holds return value for memset)
 */
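/*
 * A rough C-level sketch of that contract (hypothetical code, for
 * illustration only; ptr/pattern/count stand in for r8/r9/r4):
 *
 *      while (count >= 32) {               // leave <32 bytes for the caller
 *          memcpy(ptr,      pattern, 16);  // each 16-byte copy is one "stvx"
 *          memcpy(ptr + 16, pattern, 16);
 *          ptr   += 32;
 *          count -= 32;
 *      }
 *
 * The code below does these same stores, but in cache-line-sized gulps with
 * DCBZ when the operand is large.
 */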

#define kBig    (3*128)             // big enough to warrant using dcbz (NB: must be >= 3*128)
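// (A plausible reading of the NB: after spending up to 112 bytes (0x70) aligning
// to a cache line boundary, the operand must still contain at least two full
// 128-byte lines, since the DCBZ loop below runs "1-too-few" times and zeroes
// one line ahead; 3*128 guarantees that.)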

        .align  5
memset_g5:
        cmplwi  cr1,r4,kBig         // big enough to warrant using dcbz?
        neg     r10,r8              // start to align ptr
        mfspr   r2,vrsave           // we'll be using VRs
        andi.   r10,r10,0x70        // get #bytes to cache line align
        oris    r0,r2,0x8000        // we use vr0
        mtspr   vrsave,r0
        li      r5,16               // get offsets for "stvx"
        lvx     v0,0,r9             // load the pattern into v0
        li      r6,32
        blt     cr1,LShort          // not big enough to bother with dcbz
        li      r9,48

// cache line align

        beq     2f                  // already aligned
1:
        subic.  r10,r10,16          // more to go?
        stvx    v0,0,r8
        addi    r8,r8,16
        subi    r4,r4,16
        bne     1b

// Loop over cache lines. This code uses a private protocol with the kernel:
// when the kernel emulates an alignment exception on a DCBZ that occurs in the
// commpage, it zeroes CR7. We use this to detect the case where we are operating on
// uncached memory, and do not use DCBZ again in this code. We assume that either
// all the operand is cacheable or none of it is, so we only check the first DCBZ.
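// (The kernel's side of this protocol, roughly: when its alignment-exception
// handler emulates a DCBZ whose effective address lies in the commpage, it
// clears cr7 in the saved thread state before resuming us, making the
// "bne-- cr7" below take the branch.)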
2:
        cmpw    cr7,r3,r3           // set cr7_eq (kernel will clear if DCBZ faults)
        dcbzl   0,r8                // zero first cache line (clearing cr7 if alignment exception)
        srwi    r0,r4,7             // get #cache lines (>=2)
        rlwinm  r4,r4,0,0x7F        // mask down to residual count (0..127)
        bne--   cr7,LNoDcbz         // exit if we took alignment exception on the first DCBZ
        subic   r0,r0,1             // loop 1-too-few times
        li      r11,128             // set DCBZ look-ahead
        mtctr   r0
        b       3f                  // use loop that DCBZs

// Loop over cache lines. We DCBZ one line ahead, which is a little faster.
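// (Presumably the look-ahead hides the latency of establishing the next line
// in the cache, so the eight "stvx" below never stall waiting for it.)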

        .align  5
3:
        dcbzl   r11,r8              // zero one line ahead
        addi    r10,r8,64
        stvx    v0,0,r8
        stvx    v0,r5,r8
        stvx    v0,r6,r8
        stvx    v0,r9,r8
        addi    r8,r8,128
        stvx    v0,0,r10
        stvx    v0,r5,r10
        stvx    v0,r6,r10
        stvx    v0,r9,r10
        bdnz++  3b

        li      r0,1                // we've already DCBZ'd the last line
LNoDcbz:                            // r0: loop count
        mtctr   r0

// Loop which does not DCBZ. Normally this is only used for the last cache line,
// because we've already zeroed it.
4:
        addi    r10,r8,64
        stvx    v0,0,r8
        stvx    v0,r5,r8
        stvx    v0,r6,r8
        stvx    v0,r9,r8
        addi    r8,r8,128
        stvx    v0,0,r10
        stvx    v0,r5,r10
        stvx    v0,r6,r10
        stvx    v0,r9,r10
        bdnz--  4b                  // optimize for the cacheable case

// loop over 32-byte chunks
LShort:
        srwi.   r0,r4,5             // get count of 32-byte chunks
        rlwinm  r4,r4,0,0x1F        // mask down to residual count (0..31)
        beq     7f                  // no chunks so done
        mtctr   r0
6:
        stvx    v0,0,r8
        stvx    v0,r5,r8
        addi    r8,r8,32
        bdnz++  6b
7:
        mtspr   vrsave,r2           // restore caller's vrsave
        blr

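// The descriptor below registers this routine with the commpage mechanism.
// Reading the flag names: the routine is installed at _COMM_PAGE_MEMSET_PATTERN
// only on processors with a 128-byte cache line, 64-bit support, and Altivec
// (kCache128+k64Bit+kHasAltivec), goes into both the 32-bit and 64-bit
// commpages (kCommPageBoth), and gets the automatic 32-to-64-bit port
// described at the top of this file (kPort32to64).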
        COMMPAGE_DESCRIPTOR(memset_g5,_COMM_PAGE_MEMSET_PATTERN,kCache128+k64Bit+kHasAltivec,0, \
                kCommPageBoth+kPort32to64)