[apple/xnu.git] / osfmk / ppc / bzero.s  (xnu-792)
/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <ppc/asm.h>
#include <ppc/exception.h>
#include <assym.s>

        .text
        .align  2
        .globl  _memset
        .globl  _bzero
        .globl  _bzero_nc
        .globl  _bzero_phys


// ***********************
// * B Z E R O _ P H Y S *
// ***********************
//
// void bzero_phys(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5. We leave cache on.
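//
// At C level the sequence below is roughly the following (hypothetical C
// prototypes for the EXT() helpers; the real routines pass their state in
// registers, so treat this purely as an illustrative sketch):
//
//      void bzero_phys(addr64_t phys_addr, uint32_t length)
//      {
//          unsigned int old_msr = ml_set_physical_get_ffs(); // DR off, SF on
//          bzero((void *)(uintptr_t)phys_addr, length);      // ordinary bzero
//          ml_restore(old_msr);                              // DR on, SF off
//      }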

        .align  5
LEXT(bzero_phys)
        mflr    r12                         // save return address
        rlwinm  r3,r3,0,1,0                 // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31
        mr      r4,r5                       // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero)                  // use normal bzero() routine
        mtlr    r12                         // restore return
        b       EXT(ml_restore)             // restore MSR, turning DR on and SF off


// *******************
// * B Z E R O _ N C *
// *******************
//
// void bzero_nc(char *addr, unsigned int length);
//
// For use with uncached memory. Doesn't seem to be used at all, so probably not
// performance critical. NB: we must avoid unaligned stores, because some
// machines (e.g., 970) take alignment exceptions on _any_ unaligned op to uncached
// memory. Of course, we must also avoid dcbz.
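//
// Conceptually this is the following (illustrative C only; the real code
// falls through into bzero_tail rather than looping byte-by-byte once the
// pointer is doubleword aligned):
//
//      void bzero_nc(char *addr, unsigned int length)
//      {
//          if (length < 20) {                          // too short to bother aligning
//              while (length--) *addr++ = 0;
//              return;
//          }
//          unsigned int pad = (unsigned int)(-(uintptr_t)addr) & 7;
//          length -= pad;
//          while (pad--) *addr++ = 0;                  // doubleword align
//          // ...remaining bytes are stored with aligned std/stw etc (bzero_tail)
//      }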

LEXT(bzero_nc)
        cmplwi  cr1,r4,20                   // too short to bother with 16-byte loops?
        cmplwi  cr7,r4,0                    // check for (len==0)
        li      r6,0                        // get a 0
        bge     cr1,bznc1                   // skip if length >=20
        mtctr   r4                          // set up byte loop
        beqlr-- cr7                         // done if len=0

// Short operands, loop over bytes.

bznc0:
        stb     r6,0(r3)
        addi    r3,r3,1
        bdnz    bznc0
        blr

// Handle operands long enough to do doubleword stores; we must doubleword
// align, to avoid alignment exceptions.

bznc1:
        neg     r7,r3                       // start to compute #bytes to align
        mfsprg  r10,2                       // get feature flags
        andi.   r0,r7,7                     // get #bytes to doubleword align
        mr      r5,r3                       // make copy of operand ptr as bzero expects
        mtcrf   0x02,r10                    // put pf64Bitb etc in cr6
        beq     bzero_tail                  // already doubleword aligned
        sub     r4,r4,r0                    // adjust count
        mtctr   r0                          // set up loop
bznc2:                                      // zero bytes until doubleword aligned
        stb     r6,0(r5)
        addi    r5,r5,1
        bdnz    bznc2
        b       bzero_tail                  // join bzero, now that r5 is aligned


// *************     ***************
// * B Z E R O * and * M E M S E T *
// *************     ***************
//
// void * memset(void *b, int c, size_t len);
// void bzero(void *b, size_t len);
//
// These routines support G3, G4, and the 970, and run in both 32 and
// 64-bit mode. Lengths (size_t) are always 32 bits.
//
// Register use:
//      r0 = temp
//      r2 = temp
//      r3 = original ptr, not changed since memset returns it
//      r4 = count of bytes to set
//      r5 = working operand ptr ("rp")
//      r6 = value to store (usually 0)
//      r7-r9 = temps
//      r10 = feature flags
//      r11 = old MSR (if bzero_phys)
//      r12 = return address (if bzero_phys)
//      cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)
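//
// Overall shape of the zero path (illustrative C, not a literal translation;
// cache_line_size() and dcbz() stand in for the pf32Byte/pf128Byte feature
// test and the dcbz/dcbz128 instructions):
//
//      void bzero(void *b, size_t len)
//      {
//          unsigned char *p = b;
//          size_t line = cache_line_size();            // 32 or 128
//          if (len >= line) {
//              size_t pad = (size_t)(-(uintptr_t)p) & (line - 1);
//              for (len -= pad; pad; pad--) *p++ = 0;  // align to a cache line
//              for (; len >= line; len -= line, p += line)
//                  dcbz(p);                            // zero a whole line
//          }
//          while (len--) *p++ = 0;                     // bzero_tail, byte-level view
//      }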

        .align  5
LEXT(memset)                                // void * memset(void *b, int c, size_t len);
        andi.   r6,r4,0xFF                  // copy value to working register, test for 0
        mr      r4,r5                       // move length to working register
        bne--   memset1                     // skip if nonzero
LEXT(bzero)                                 // void bzero(void *b, size_t len);
        dcbtst  0,r3                        // touch in 1st cache block
        mfsprg  r10,2                       // get features
        li      r6,0                        // get a 0
        neg     r7,r3                       // start to compute #bytes to align
        andi.   r0,r10,pf128Byte+pf32Byte   // get cache line size
        mtcrf   0x02,r10                    // put pf128Byte etc in cr6
        cmplw   r4,r0                       // operand length >= cache line size?
        mr      r5,r3                       // make copy of operand ptr (can't change r3)
        blt     bzero_tail                  // too short for dcbz (or dcbz128)
        rlwinm  r0,r7,0,0x1F                // get #bytes to 32-byte align
        rlwinm  r9,r7,0,0x7F                // get #bytes to 128-byte align
        bt++    pf128Byteb,bzero_128        // skip if 128-byte processor

// Operand length >=32 and cache line size is 32.
//      r0 = #bytes to 32-byte align
//      r4 = length
//      r5 = ptr to operand
//      r6 = 0

        sub     r2,r4,r0                    // adjust length
        cmpwi   cr1,r0,0                    // already 32-byte aligned?
        srwi.   r8,r2,5                     // get #32-byte chunks
        beq     bzero_tail                  // not long enough to dcbz
        mtctr   r8                          // set up loop count
        rlwinm  r4,r2,0,27,31               // mask down to leftover byte count
        beq     cr1,bz_dcbz32               // skip if already 32-byte aligned

// 32-byte align. We just store 32 0s, rather than test and use conditional
// branches. This is usually faster, because there are no mispredicts.

        stw     r6,0(r5)                    // zero next 32 bytes
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        stw     r6,16(r5)
        stw     r6,20(r5)
        stw     r6,24(r5)
        stw     r6,28(r5)
        add     r5,r5,r0                    // now r5 is 32-byte aligned
        b       bz_dcbz32

// Loop doing 32-byte version of DCBZ instruction.

        .align  4                           // align the inner loop
bz_dcbz32:
        dcbz    0,r5                        // zero another 32 bytes
        addi    r5,r5,32
        bdnz    bz_dcbz32

// Store trailing bytes. This routine is used both by bzero and memset.
//      r4 = #bytes to store (may be large if memset)
//      r5 = address
//      r6 = value to store (in all 8 bytes)
//      cr6 = pf64Bit etc flags
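//
// Conceptually (illustrative C; the mtcrf below copies the low four bits of
// r4 into cr7, so the "len & 8/4/2/1" tests become the bf 28..31 branches,
// and storeN() is just shorthand for N bytes of stw/std/sth/stb):
//
//      for (n = len >> 4; n; n--) { store16(p, val); p += 16; }
//      if (len & 8) { store8(p, val); p += 8; }
//      if (len & 4) { store4(p, val); p += 4; }
//      if (len & 2) { store2(p, val); p += 2; }
//      if (len & 1) { store1(p, val); }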

bzero_tail:
        srwi.   r0,r4,4                     // get #(16-byte-chunks)
        mtcrf   0x01,r4                     // remaining byte count to cr7
        beq     bzt3                        // no 16-byte chunks
        mtctr   r0                          // set up loop count
        bt++    pf64Bitb,bzt2               // skip if 64-bit processor
        b       bzt1
        .align  5
bzt1:                                       // loop over 16-byte chunks on 32-bit processor
        stw     r6,0(r5)
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        addi    r5,r5,16
        bdnz    bzt1
        b       bzt3
        .align  5
bzt2:                                       // loop over 16-byte chunks on 64-bit processor
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bdnz    bzt2
        bf      28,bzt4                     // 8-byte chunk?
        std     r6,0(r5)
        addi    r5,r5,8
        b       bzt4
bzt3:
        bf      28,bzt4                     // 8-byte chunk?
        stw     r6,0(r5)
        stw     r6,4(r5)
        addi    r5,r5,8
bzt4:
        bf      29,bzt5                     // word?
        stw     r6,0(r5)
        addi    r5,r5,4
bzt5:
        bf      30,bzt6                     // halfword?
        sth     r6,0(r5)
        addi    r5,r5,2
bzt6:
        bflr    31                          // byte?
        stb     r6,0(r5)
        blr

// Operand length is >=128 and cache line size is 128. We assume that
// because the linesize is 128 bytes, this is a 64-bit processor.
//      r4 = length
//      r5 = ptr to operand
//      r6 = 0
//      r7 = neg(r5)
//      r9 = #bytes to 128-byte align
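//
// Shape of this path (illustrative C; storing 16 bytes unconditionally and
// then bumping the pointer replaces a byte loop for the 16-byte-align step):
//
//      lines = (len - ((-(uintptr_t)p) & 127)) >> 7;   // r8: whole cache lines
//      store16(p, 0);                                  // the two std below
//      pad = (-(uintptr_t)p) & 15;  p += pad;  len -= pad;
//      if (lines) {
//          while ((uintptr_t)p & 127) { store16(p, 0); p += 16; len -= 16; }
//          do { dcbz128(p); p += 128; len -= 128; } while (--lines);
//      }
//      // the 0..127 leftover bytes fall through to bzero_tail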

        .align  5
bzero_128:
        sub     r2,r4,r9                    // r2 <- length remaining after cache-line aligning
        rlwinm  r0,r7,0,0xF                 // r0 <- #bytes to 16-byte align
        srwi.   r8,r2,7                     // r8 <- number of cache lines to 0
        std     r6,0(r5)                    // always store 16 bytes to 16-byte align...
        std     r6,8(r5)                    // ...even if too short for dcbz128
        add     r5,r5,r0                    // 16-byte align ptr
        sub     r4,r4,r0                    // adjust count
        beq     bzero_tail                  // r8==0, not long enough to dcbz128
        sub.    r7,r9,r0                    // get #bytes remaining to 128-byte align
        rlwinm  r4,r2,0,0x7F                // r4 <- length remaining after dcbz128'ing
        mtctr   r8                          // set up dcbz128 loop
        beq     bz_dcbz128                  // already 128-byte aligned
        b       bz_align                    // enter loop over 16-byte chunks

// 128-byte align by looping over 16-byte chunks.

        .align  5
bz_align:                                   // loop over 16-byte chunks
        subic.  r7,r7,16                    // more to go?
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bgt     bz_align

        b       bz_dcbz128                  // enter dcbz128 loop

// Loop over 128-byte cache lines.
//      r4 = length remaining after cache lines (0..127)
//      r5 = ptr (128-byte aligned)
//      r6 = 0
//      ctr = count of cache lines to 0

        .align  5
bz_dcbz128:
        dcbz128 0,r5                        // zero a 128-byte cache line
        addi    r5,r5,128
        bdnz    bz_dcbz128

        b       bzero_tail                  // handle leftovers


// Handle memset() for nonzero values. This case is relatively infrequent;
// the large majority of memset() calls are for 0.
//      r3 = ptr
//      r4 = count
//      r6 = value in lower byte (nonzero)
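//
// The two rlwimi and the rlwinm below replicate the byte through the
// register; for example, with c = 0xAB (the 64-bit step only matters when
// the std path of bzero_tail is used):
//
//      0x000000AB  --rlwimi 8-->  0x0000ABAB
//                  --rlwimi 16--> 0xABABABAB
//                  --rlwinm-->    0xABABABABABABABAB   (low word copied high)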

memset1:
        cmplwi  r4,16                       // too short to bother aligning?
        rlwimi  r6,r6,8,16,23               // replicate value to low 2 bytes
        mr      r5,r3                       // make working copy of operand ptr
        rlwimi  r6,r6,16,0,15               // value now in all 4 bytes
        blt     bzero_tail                  // length<16, we won't be using "std"
        mfsprg  r10,2                       // get feature flags
        neg     r7,r5                       // start to compute #bytes to align
        rlwinm  r6,r6,0,1,0                 // value now in all 8 bytes (if 64-bit)
        andi.   r0,r7,7                     // r0 <- #bytes to doubleword align
        stw     r6,0(r5)                    // store 8 bytes to avoid a loop
        stw     r6,4(r5)
        mtcrf   0x02,r10                    // get pf64Bit flag etc in cr6
        sub     r4,r4,r0                    // adjust count
        add     r5,r5,r0                    // doubleword align ptr
        b       bzero_tail