]> git.saurik.com Git - apple/xnu.git/blob - osfmk/ppc/commpage/memset_g5.s
xnu-792.25.20.tar.gz
[apple/xnu.git] / osfmk / ppc / commpage / memset_g5.s
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23 #define ASSEMBLER
24 #include <sys/appleapiopts.h>
25 #include <ppc/asm.h>
26 #include <machine/cpu_capabilities.h>
27 #include <machine/commpage.h>
28
29 .text
30 .align 2
31 /*
32 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
33 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
34 * simple transformations:
35 * - all word compares are changed to doubleword
36 * - all "srwi[.]" opcodes are changed to "srdi[.]"
37 * Nothing else is done. For this to work, the following rules must be
38 * carefully followed:
39 * - do not use carry or overflow
40 * - only use record mode if you are sure the results are mode-invariant
41 * for example, all "andi." and almost all "rlwinm." are fine
42 * - do not use "slwi", "slw", or "srw"
43 * An imaginative programmer could break the porting model in other ways, but the above
44 * are the most likely problem areas. It is perhaps surprising how well in practice
45 * this simple method works.
46 */
47
48 /* *********************
49 * * M E M S E T _ G 5 *
50 * *********************
51 *
52 * This is a subroutine called by Libc memset and memset_pattern for large nonzero
53 * operands (zero operands are funneled into bzero.) This version is for
54 * 64-bit processors with a 128-byte cache line and Altivec.
55 *
56 * Registers at entry:
57 * r4 = count of bytes to store (must be >= 32)
58 * r8 = ptr to the 1st byte to store (16-byte aligned)
59 * r9 = ptr to 16-byte pattern to store (16-byte aligned)
60 * When we return:
61 * r3 = not changed, since memset returns it
62 * r4 = bytes remaining to store (will be <32)
63 * r7 = not changed
64 * r8 = ptr to next byte to store (still 16-byte aligned)
65 * r12 = not changed (holds return value for memset)
 * Also modified: r0, r2, r5, r6, r10, ctr, cr0, cr1 and v0. When the count is >= kBig,
 * r9 and cr7 are overwritten as well, r11 is overwritten if the DCBZ loop runs, and
 * subic/subic. write the carry bit (which this code never reads).
 * VRSAVE is changed while the routine runs; the entry value is kept in r2 and written
 * back just before the blr.
66 */
67
// Why kBig must be >= 3*128: the alignment loop below can consume up to 0x70 (112) bytes,
// and the DCBZ path then needs at least two whole 128-byte lines, because it loads ctr
// with (line count - 1) and that value must not be zero. 3*128 - 112 = 272 >= 2*128.
68 #define kBig (3*128) // big enough to warrant using dcbz (NB: must be >= 3*128)
69
70 .align 5
71 memset_g5:
72 cmplwi cr1,r4,kBig // cr1 = (count vs kBig), unsigned; consumed by the "blt cr1" below
73 neg r10,r8 // r10 = -ptr, first step of computing distance to next line boundary
74 mfspr r2,vrsave // keep caller's vrsave in r2 until the exit path restores it
75 andi. r10,r10,0x70 // r10 = bytes (multiple of 16, 0..112) up to the next 128-byte boundary; cr0 feeds "beq 2f"
76 oris r0,r2,0x8000 // set the high-order vrsave bit to mark v0 as in use
77 mtspr vrsave,r0 // publish the updated vrsave before touching v0
78 li r5,16 // r5 = offset 16 for indexed "stvx"
79 lvx v0,0,r9 // load the 16-byte pattern into v0 (last use of r9 as a pointer)
80 li r6,32 // r6 = offset 32 for indexed "stvx"
81 blt cr1,LShort // count < kBig: skip alignment and DCBZ, go straight to the 32-byte loop
82 li r9,48 // r9 = offset 48 for indexed "stvx" (pattern pointer no longer needed)
83
84 // cache line align: store 16 bytes at a time until r8 is on a 128-byte boundary
85
86 beq 2f // cr0 still holds the andi. result: r10 == 0 means already aligned
87 1:
88 subic. r10,r10,16 // one fewer 16-byte piece to go; cr0 drives the "bne 1b" below
89 stvx v0,0,r8 // store one 16-byte pattern copy
90 addi r8,r8,16 // advance the destination pointer
91 subi r4,r4,16 // and charge those 16 bytes against the count
92 bne 1b // repeat until r10 reaches zero
93
94 // Set up to loop over cache lines. This code uses a private protocol with the kernel:
95 // when the kernel emulates an alignment exception on a DCBZ that occurs in the
96 // commpage, it zeroes CR7. We use this to detect the case where we are operating on
97 // uncached memory, and do not use DCBZ again in this code. We assume that either
98 // all the operand is cacheable or none of it is, so we only check the first DCBZ.
99 2:
100 cmpw cr7,r3,r3 // compare r3 with itself to force cr7_eq on (kernel will clear if DCBZ faults)
101 dcbzl 0,r8 // zero first cache line (clearing cr7 if alignment exception)
102 srwi r0,r4,7 // r0 = count/128 = #cache lines (>=2, see kBig)
103 rlwinm r4,r4,0,0x7F // mask down to residual count (0..127)
104 bne-- cr7,LNoDcbz // alignment exception on the first DCBZ: store all r0 lines with the non-DCBZ loop
105 subic r0,r0,1 // loop 1-too-few times; the last line is stored by loop 4 below
106 li r11,128 // set DCBZ look-ahead: r11 is the offset of the line after the current one
107 mtctr r0 // ctr = #lines - 1 (>= 1)
108 b 3f // use loop that DCBZs (an ".align 5" sits between here and label 3)
109
110 // Loop over cache lines. We DCBZ one line ahead, which is a little faster.
111 // Each pass fills the 128-byte line at r8 with eight 16-byte stores.
112 .align 5
113 3:
114 dcbzl r11,r8 // zero one line ahead (the line at r8+128)
115 addi r10,r8,64 // r10 = second half of the current line, taken before r8 is advanced
116 stvx v0,0,r8 // bytes 0..15 of the line
117 stvx v0,r5,r8 // bytes 16..31
118 stvx v0,r6,r8 // bytes 32..47
119 stvx v0,r9,r8 // bytes 48..63
120 addi r8,r8,128 // r8 = next line; the remaining stores go through r10
121 stvx v0,0,r10 // bytes 64..79
122 stvx v0,r5,r10 // bytes 80..95
123 stvx v0,r6,r10 // bytes 96..111
124 stvx v0,r9,r10 // bytes 112..127
125 bdnz++ 3b // decrement ctr and loop while it is nonzero
126
127 li r0,1 // we've already DCBZ'd the last line, so exactly one line is left to fill
128 LNoDcbz: // r0: loop count (1 on the fall-through path, all lines when DCBZ faulted)
129 mtctr r0 // ctr = number of lines to fill without DCBZ
130
131 // Loop which does not DCBZ. Normally this is only used for last cache line,
132 // because we've already zeroed it.
133 4:
134 addi r10,r8,64 // r10 = second half of the current line
135 stvx v0,0,r8 // bytes 0..15 of the line
136 stvx v0,r5,r8 // bytes 16..31
137 stvx v0,r6,r8 // bytes 32..47
138 stvx v0,r9,r8 // bytes 48..63
139 addi r8,r8,128 // r8 = next line; the remaining stores go through r10
140 stvx v0,0,r10 // bytes 64..79
141 stvx v0,r5,r10 // bytes 80..95
142 stvx v0,r6,r10 // bytes 96..111
143 stvx v0,r9,r10 // bytes 112..127
144 bdnz-- 4b // optimize for the cacheable case
145
146 // loop over 32-byte chunks (entered directly for small counts, or by fall-through with the 0..127 residual)
147 LShort:
148 srwi. r0,r4,5 // r0 = count/32 = number of 32-byte chunks; cr0 feeds "beq 7f"
149 rlwinm r4,r4,0,0x1F // mask down to residual count (0..31), returned to the caller in r4
150 beq 7f // no chunks so done (also avoids loading ctr with zero)
151 mtctr r0 // ctr = number of 32-byte chunks
152 6:
153 stvx v0,0,r8 // first 16 bytes of the chunk
154 stvx v0,r5,r8 // second 16 bytes of the chunk
155 addi r8,r8,32 // advance the destination pointer past the chunk
156 bdnz++ 6b // decrement ctr and loop while it is nonzero
157 7:
158 mtspr vrsave,r2 // restore caller's vrsave from the copy taken at entry
159 blr // return; r4 = residual count, r8 = next byte to store
160
161
// Commpage descriptor for memset_g5, tied to the _COMM_PAGE_MEMSET_PATTERN symbol.
// NOTE(review): the capability flags appear to mirror the routine's header comment
// (128-byte cache line, 64-bit, Altivec), and kPort32to64 presumably requests the
// 32-to-64-bit port described in the WARNING near the top of this file -- confirm against
// the COMMPAGE_DESCRIPTOR definition, which is not visible here.
162 COMMPAGE_DESCRIPTOR(memset_g5,_COMM_PAGE_MEMSET_PATTERN,kCache128+k64Bit+kHasAltivec,0, \
163 kCommPageBoth+kPort32to64)