]> git.saurik.com Git - apple/xnu.git/blame_incremental - osfmk/ppc/bzero.s
xnu-792.6.56.tar.gz
[apple/xnu.git] / osfmk / ppc / bzero.s
... / ...
CommitLineData
1/*
2 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24#include <ppc/asm.h>
25#include <ppc/exception.h>
26#include <assym.s>
27
// All routines in this file are placed in the text section; the four
// symbols below are the entry points made visible to other files.
28 .text
29 .align 2
30 .globl _memset
31 .globl _bzero
32 .globl _bzero_nc
33 .globl _bzero_phys
34
35
36// ***********************
37// * B Z E R O _ P H Y S *
38// ***********************
39//
40// void bzero_phys(addr64_t phys_addr, uint32_t length);
41//
42// Takes a phys addr in (r3,r4), and length in r5. We leave cache on.
43
44 .align 5
// bzero_phys: zero memory given a 64-bit physical address.
//   in:  (r3,r4) = physical address as a long-long pair, r5 = length
// The address halves are merged into r3 and the length is moved to r4 so
// that the ordinary bzero() can do the work. r12 carries the return
// address across the two calls, since each bl overwrites LR.
45LEXT(bzero_phys)
46 mflr r12 // save return address (the bl's below overwrite LR)
47 rlwinm r3,r3,0,1,0 // coalesce long-long in (r3,r4) into reg64_t in r3
48 rlwimi r3,r4,0,0,31 // ...by inserting all 32 bits of r4 as the low word of r3
49 mr r4,r5 // put length where bzero() expects it
50 bl EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
51 bl EXT(bzero) // use normal bzero() routine
52 mtlr r12 // restore return
// Branch (not bl) so LR still holds our caller's address.
// NOTE(review): presumably ml_restore returns through LR - confirm.
53 b EXT(ml_restore) // restore MSR, turning DR on and SF off
54
55
56// *******************
57// * B Z E R O _ N C *
58// *******************
59//
60// void bzero_nc(char *addr, unsigned int length);
61//
62// For use with uncached memory. Doesn't seem to be used at all, so probably not
63// performance critical. NB: we must avoid unaligned stores, because some
64// machines (eg, 970) take alignment exceptions on _any_ unaligned op to uncached
65// memory. Of course, we must also avoid dcbz.
66
// bzero_nc: zero memory using no dcbz and no unaligned store.
//   in:  r3 = addr, r4 = length
// Lengths below 20 are cleared by a simple byte loop. Longer operands are
// byte-stored up to the next doubleword boundary and then handed to
// bzero_tail with r4 = remaining count, r5 = working ptr, r6 = 0 and the
// feature flags in cr6.
67LEXT(bzero_nc)
68 cmplwi cr1,r4,20 // too short to bother with 16-byte loops?
69 cmplwi cr7,r4,0 // check for (len==0)
70 li r6,0 // get a 0
71 bge cr1,bznc1 // skip if length >=20
72 mtctr r4 // set up byte loop
73 beqlr-- cr7 // done if len=0
74
75// Short operands, loop over bytes.
76
77bznc0:
78 stb r6,0(r3) // r3 itself is advanced here; this routine returns nothing
79 addi r3,r3,1
80 bdnz bznc0
81 blr
82
83// Handle operands long enough to do doubleword stores; we must doubleword
84// align, to avoid alignment exceptions.
85
86bznc1:
87 neg r7,r3 // start to compute #bytes to align
88 mfsprg r10,2 // get feature flags
89 andi. r0,r7,7 // get #bytes to doubleword align
90 mr r5,r3 // make copy of operand ptr, as bzero_tail expects it in r5
91 mtcrf 0x02,r10 // put pf64Bitb etc in cr6
92 beq bzero_tail // already doubleword aligned (cr0 from the andi. above)
93 sub r4,r4,r0 // adjust count
94 mtctr r0 // set up loop
95bznc2: // zero bytes until doubleword aligned
96 stb r6,0(r5)
97 addi r5,r5,1
98 bdnz bznc2
99 b bzero_tail // join bzero, now that r5 is aligned
100
101
102// ************* ***************
103// * B Z E R O * and * M E M S E T *
104// ************* ***************
105//
106// void * memset(void *b, int c, size_t len);
107// void bzero(void *b, size_t len);
108//
109// These routines support G3, G4, and the 970, and run in both 32 and
110// 64-bit mode. Lengths (size_t) are always 32 bits.
111//
112// Register use:
113// r0 = temp
114// r2 = temp
115// r3 = original ptr, not changed since memset returns it
116// r4 = count of bytes to set
117// r5 = working operand ptr ("rp")
118// r6 = value to store (usually 0)
119// r7-r9 = temps
120// r10 = feature flags
121// r11 = old MSR (if bzero_phys)
122// r12 = return address (if bzero_phys)
123// cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)
124
125 .align 5
// memset() with a zero fill byte falls straight through into bzero();
// a nonzero fill byte is handled separately at memset1.
126LEXT(memset) // void * memset(void *b, int c, size_t len);
127 andi. r6,r4,0xFF // copy value to working register, test for 0
128 mr r4,r5 // move length to working register
129 bne-- memset1 // skip if nonzero
// From here on: r3 = ptr (never modified), r4 = length.
130LEXT(bzero) // void bzero(void *b, size_t len);
131 dcbtst 0,r3 // touch in 1st cache block
132 mfsprg r10,2 // get features
133 li r6,0 // get a 0
134 neg r7,r3 // start to compute #bytes to align
// NOTE(review): the next two instructions treat the masked flag bits as a
// byte count, so they rely on pf128Byte/pf32Byte being numerically equal to
// the line size they denote - confirm against the definitions.
135 andi. r0,r10,pf128Byte+pf32Byte // get cache line size
136 mtcrf 0x02,r10 // put pf128Byte etc in cr6
137 cmplw r4,r0 // operand length >= cache line size?
138 mr r5,r3 // make copy of operand ptr (can't change r3)
139 blt bzero_tail // too short for dcbz (or dcbz128)
140 rlwinm r0,r7,0,0x1F // get #bytes to 32-byte align
141 rlwinm r9,r7,0,0x7F // get #bytes to 128-byte align
142 bt++ pf128Byteb,bzero_128 // skip if 128-byte processor
143
144// Operand length >=32 and cache line size is 32.
145// r0 = #bytes to 32-byte align
146// r4 = length
147// r5 = ptr to operand
148// r6 = 0
149
150 sub r2,r4,r0 // adjust length
151 cmpwi cr1,r0,0 // already 32-byte aligned?
152 srwi. r8,r2,5 // get #32-byte chunks
153 beq bzero_tail // not long enough to dcbz (r4,r5 still describe the whole operand)
154 mtctr r8 // set up loop count
155 rlwinm r4,r2,0,27,31 // mask down to leftover byte count
156 beq cr1,bz_dcbz32 // skip if already 32-byte aligned
157
158// 32-byte align. We just store 32 0s, rather than test and use conditional
159// branches. This is usually faster, because there are no mispredicts.
// At least r0+32 bytes remain here (r8 != 0), so all 32 bytes lie inside
// the operand. Bytes past the alignment point are zeroed again by the
// first dcbz, which is harmless.
160
161 stw r6,0(r5) // zero next 32 bytes
162 stw r6,4(r5)
163 stw r6,8(r5)
164 stw r6,12(r5)
165 stw r6,16(r5)
166 stw r6,20(r5)
167 stw r6,24(r5)
168 stw r6,28(r5)
169 add r5,r5,r0 // now r5 is 32-byte aligned
170 b bz_dcbz32
171
172// Loop doing 32-byte version of DCBZ instruction.
173
174 .align 4 // align the inner loop
175bz_dcbz32:
176 dcbz 0,r5 // zero another 32 bytes
177 addi r5,r5,32
178 bdnz bz_dcbz32 // falls through into bzero_tail with r4 = leftover bytes (0..31)
179
180// Store trailing bytes. This routine is used both by bzero and memset.
181// r4 = #bytes to store (may be large if memset)
182// r5 = address
183// r6 = value to store (in all 8 bytes)
184// cr6 = pf64Bit etc flags
185
// The low four bits of the count are copied into cr7, so that CR bits
// 28, 29, 30 and 31 select the final 8-, 4-, 2- and 1-byte stores.
// cr6 (pf64Bitb) is consulted only when there is at least one 16-byte chunk.
186bzero_tail:
187 srwi. r0,r4,4 // get #(16-byte-chunks)
188 mtcrf 0x01,r4 // remaining byte count to cr7
189 beq bzt3 // no 16-byte chunks
190 mtctr r0 // set up loop count
191 bt++ pf64Bitb,bzt2 // skip if 64-bit processor
192 b bzt1 // branch to the aligned loop rather than fall through the .align
193 .align 5
194bzt1: // loop over 16-byte chunks on 32-bit processor
195 stw r6,0(r5)
196 stw r6,4(r5)
197 stw r6,8(r5)
198 stw r6,12(r5)
199 addi r5,r5,16
200 bdnz bzt1
201 b bzt3
202 .align 5
203bzt2: // loop over 16-byte chunks on 64-bit processor
204 std r6,0(r5)
205 std r6,8(r5)
206 addi r5,r5,16
207 bdnz bzt2
208 bf 28,bzt4 // 8-byte chunk?
209 std r6,0(r5)
210 addi r5,r5,8
211 b bzt4
// bzt3 is reached from the 32-bit loop and whenever there were no 16-byte
// chunks, so the 8-byte piece is written as two words rather than one std.
212bzt3:
213 bf 28,bzt4 // 8-byte chunk?
214 stw r6,0(r5)
215 stw r6,4(r5)
216 addi r5,r5,8
217bzt4:
218 bf 29,bzt5 // word?
219 stw r6,0(r5)
220 addi r5,r5,4
221bzt5:
222 bf 30,bzt6 // halfword?
223 sth r6,0(r5)
224 addi r5,r5,2
225bzt6:
226 bflr 31 // byte? (return now if the count is even)
227 stb r6,0(r5)
228 blr
229
230// Operand length is >=128 and cache line size is 128. We assume that
231// because the linesize is 128 bytes, this is a 64-bit processor.
232// r4 = length
233// r5 = ptr to operand
234// r6 = 0
235// r7 = neg(r5)
236// r9 = #bytes to 128-byte align
237
238 .align 5
// Three stages: one unconditional 16-byte store to reach 16-byte alignment,
// a loop of 16-byte stores to reach 128-byte alignment, then dcbz128 per
// cache line. Leftover bytes are finished by bzero_tail.
239bzero_128:
240 sub r2,r4,r9 // r2 <- length remaining after cache-line aligning
241 rlwinm r0,r7,0,0xF // r0 <- #bytes to 16-byte align
242 srwi. r8,r2,7 // r8 <- number of cache lines to 0
243 std r6,0(r5) // always store 16 bytes to 16-byte align...
244 std r6,8(r5) // ...even if too short for dcbz128
245 add r5,r5,r0 // 16-byte align ptr
246 sub r4,r4,r0 // adjust count
247 beq bzero_tail // r8==0, not long enough to dcbz128 (cr0 still from the srwi. above)
248 sub. r7,r9,r0 // get #bytes remaining to 128-byte align
249 rlwinm r4,r2,0,0x7F // r4 <- length remaining after dcbz128'ing
250 mtctr r8 // set up dcbz128 loop
251 beq bz_dcbz128 // already 128-byte aligned
252 b bz_align // enter loop over 16-byte chunks
253
254// 128-byte align by looping over 16-byte chunks.
// r7 = #bytes still to be stored before r5 is 128-byte aligned; it is a
// nonzero multiple of 16 on entry. CTR already holds the dcbz128 line
// count and is not touched by this loop.
255
256 .align 5
257bz_align: // loop over 16-byte chunks
258 subic. r7,r7,16 // more to go?
259 std r6,0(r5)
260 std r6,8(r5)
261 addi r5,r5,16
262 bgt bz_align // cr0 from the subic. above; the stores do not change it
263
264 b bz_dcbz128 // enter dcbz128 loop
265
266// Loop over 128-byte cache lines.
267// r4 = length remaining after cache lines (0..127)
268// r5 = ptr (128-byte aligned)
269// r6 = 0
270// ctr = count of cache lines to 0
271
272 .align 5
273bz_dcbz128:
274 dcbz128 0,r5 // zero a 128-byte cache line
275 addi r5,r5,128
276 bdnz bz_dcbz128
277
278 b bzero_tail // handle leftovers
279
280
281// Handle memset() for nonzero values. This case is relatively infrequent;
282// the large majority of memset() calls are for 0.
283// r3 = ptr
284// r4 = count
285// r6 = value in lower byte (nonzero)
286
// The fill byte is replicated across r6. For lengths of 16 or more, eight
// bytes are stored unconditionally and the pointer is advanced to the next
// doubleword boundary before joining bzero_tail. Every byte of r6 is the
// same, so the shifted starting point does not change the pattern written.
287memset1:
288 cmplwi r4,16 // too short to bother aligning?
289 rlwimi r6,r6,8,16,23 // replicate value to low 2 bytes
290 mr r5,r3 // make working copy of operand ptr
291 rlwimi r6,r6,16,0,15 // value now in all 4 bytes
// cr6 has not been loaded on this short path; that is fine because
// bzero_tail tests pf64Bitb only when there is at least one 16-byte chunk.
292 blt bzero_tail // length<16, we won't be using "std"
293 mfsprg r10,2 // get feature flags
294 neg r7,r5 // start to compute #bytes to align
295 rlwinm r6,r6,0,1,0 // value now in all 8 bytes (if 64-bit)
296 andi. r0,r7,7 // r0 <- #bytes to doubleword align
297 stw r6,0(r5) // store 8 bytes to avoid a loop
298 stw r6,4(r5)
299 mtcrf 0x02,r10 // get pf64Bit flag etc in cr6
300 sub r4,r4,r0 // adjust count
301 add r5,r5,r0 // doubleword align ptr
302 b bzero_tail
303
304
305