/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .align  2
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done. For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        (for example, all "andi." and almost all "rlwinm." are fine)
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas. It is perhaps surprising how well in practice
 * this simple method works.
 */
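/*
 * Illustrative sketch of the port described above (the rewrite happens to the
 * binary image copied into the 64-bit commpage, not to this source): a 32-bit
 * sequence such as
 *      cmplwi  cr1,r4,kBig             // word compare
 *      srwi    r0,r4,7                 // word shift right
 * would run in the 64-bit commpage as
 *      cmpldi  cr1,r4,kBig             // doubleword compare
 *      srdi    r0,r4,7                 // doubleword shift right
 */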

/* *********************
 * * M E M S E T _ G 5 *
 * *********************
 *
 * This is a subroutine called by Libc memset and memset_pattern for large nonzero
 * operands (zero operands are funneled into bzero). This version is for
 * 64-bit processors with a 128-byte cache line and Altivec.
 *
 * Registers at entry:
 *      r4 = count of bytes to store (must be >= 32)
 *      r8 = ptr to the 1st byte to store (16-byte aligned)
 *      r9 = ptr to 16-byte pattern to store (16-byte aligned)
 * When we return:
 *      r3 = not changed, since memset returns it
 *      r4 = bytes remaining to store (will be <32)
 *      r7 = not changed
 *      r8 = ptr to next byte to store (still 16-byte aligned)
 *      r12 = not changed (holds return value for memset)
 */
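/*
 * Worked example of the contract above, with hypothetical values: entered with
 * r4 = 1000 and r8 already 128-byte aligned, the routine stores the pattern
 * over 992 bytes, returning r4 = 8 with r8 advanced by 992; the caller then
 * finishes the sub-32-byte residual itself.
 */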

#define kBig    (3*128)                 // big enough to warrant using dcbz (NB: must be >= 3*128)
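// Why 3*128 suffices (derived from the code below): cache line alignment can
// consume at most 112 bytes ("andi." with 0x70), so a count >= 3*128 leaves at
// least two full 128-byte lines, which the dcbz loop needs because it always
// clears one line ahead.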

        .align  5
memset_g5:
        cmplwi  cr1,r4,kBig             // big enough to warrant using dcbz?
        neg     r10,r8                  // start to align ptr
        mfspr   r2,vrsave               // we'll be using VRs
        andi.   r10,r10,0x70            // get #bytes to cache line align
        oris    r0,r2,0x8000            // we use vr0
        mtspr   vrsave,r0
        li      r5,16                   // get offsets for "stvx"
        lvx     v0,0,r9                 // load the pattern into v0
        li      r6,32
        blt     cr1,LShort              // not big enough to bother with dcbz
        li      r9,48

// cache line align

        beq     2f                      // already aligned
1:
        subic.  r10,r10,16              // more to go?
        stvx    v0,0,r8
        addi    r8,r8,16
        subi    r4,r4,16
        bne     1b

// Loop over cache lines. This code uses a private protocol with the kernel:
// when the kernel emulates an alignment exception on a DCBZ that occurs in the
// commpage, it zeroes CR7. We use this to detect the case where we are operating on
// uncached memory, and do not use DCBZ again in this code. We assume that either
// all the operand is cacheable or none of it is, so we only check the first DCBZ.
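// (How the protocol is exercised below: "cmpw cr7,r3,r3" sets cr7_eq, since any
// register compares equal to itself; if the kernel's alignment-exception handler
// has to emulate the "dcbzl", it clears CR7, and "bne-- cr7" then routes us to
// the store-only loop at LNoDcbz.)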
2:
        cmpw    cr7,r3,r3               // set cr7_eq (kernel will clear if DCBZ faults)
        dcbzl   0,r8                    // zero first cache line (clearing cr7 if alignment exception)
        srwi    r0,r4,7                 // get #cache lines (>=2)
        rlwinm  r4,r4,0,0x7F            // mask down to residual count (0..127)
        bne--   cr7,LNoDcbz             // exit if we took alignment exception on the first DCBZ
        subic   r0,r0,1                 // loop 1-too-few times
        li      r11,128                 // set DCBZ look-ahead
        mtctr   r0
        b       3f                      // use loop that DCBZs

// Loop over cache lines. We DCBZ one line ahead, which is a little faster.

        .align  5
3:
        dcbzl   r11,r8                  // zero one line ahead
        addi    r10,r8,64
        stvx    v0,0,r8
        stvx    v0,r5,r8
        stvx    v0,r6,r8
        stvx    v0,r9,r8
        addi    r8,r8,128
        stvx    v0,0,r10
        stvx    v0,r5,r10
        stvx    v0,r6,r10
        stvx    v0,r9,r10
        bdnz++  3b

        li      r0,1                    // we've already DCBZ'd the last line
LNoDcbz:                                // r0: loop count
        mtctr   r0
// Loop which does not DCBZ. Normally this is only used for the last cache line,
// because we've already zeroed it.
4:
        addi    r10,r8,64
        stvx    v0,0,r8
        stvx    v0,r5,r8
        stvx    v0,r6,r8
        stvx    v0,r9,r8
        addi    r8,r8,128
        stvx    v0,0,r10
        stvx    v0,r5,r10
        stvx    v0,r6,r10
        stvx    v0,r9,r10
        bdnz--  4b                      // optimize for the cacheable case

// loop over 32-byte chunks
LShort:
        srwi.   r0,r4,5                 // get count of 32-byte chunks
        rlwinm  r4,r4,0,0x1F            // mask down to residual count (0..31)
        beq     7f                      // no chunks so done
        mtctr   r0
6:
        stvx    v0,0,r8
        stvx    v0,r5,r8
        addi    r8,r8,32
        bdnz++  6b
7:
        mtspr   vrsave,r2               // restore caller's vrsave
        blr


        COMMPAGE_DESCRIPTOR(memset_g5,_COMM_PAGE_MEMSET_PATTERN,kCache128+k64Bit+kHasAltivec,0, \
                kCommPageBoth+kPort32to64)