/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <ppc/asm.h>
#include <ppc/exception.h>
#include <assym.s>

        .text
        .align  2
        .globl  _memset
        .globl  _bzero
        .globl  _bzero_nc
        .globl  _bzero_phys
        .globl  _bzero_phys_nc


// *****************************
// * B Z E R O _ P H Y S _ N C *
// *****************************
//
// void bzero_phys_nc(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5. NO CACHING
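//
// Both this routine and bzero_phys merge the (r3,r4) register pair into a
// single 64-bit physical address in r3.  "rlwinm r3,r3,0,1,0" uses a
// wrap-around mask (MB > ME) that keeps every bit while replicating r3's low
// word into its upper half, and "rlwimi r3,r4,0,0,31" then inserts r4 into
// the low 32 bits, so r3 ends up holding (high32 << 32) | low32.  On 32-bit
// processors the same two instructions simply leave the low word, r4, in r3.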

        .align  5
LEXT(bzero_phys_nc)
        mflr    r12                         // save return address
        rlwinm  r3,r3,0,1,0                 // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31
        mr      r4,r5                       // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero_nc)               // use normal bzero() routine
        mtlr    r12                         // restore return
        b       EXT(ml_restore)             // restore MSR, turning DR on and SF off


// ***********************
// * B Z E R O _ P H Y S *
// ***********************
//
// void bzero_phys(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5. We leave cache on.

        .align  5
LEXT(bzero_phys)
        mflr    r12                         // save return address
        rlwinm  r3,r3,0,1,0                 // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31
        mr      r4,r5                       // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero)                  // use normal bzero() routine
        mtlr    r12                         // restore return
        b       EXT(ml_restore)             // restore MSR, turning DR on and SF off


// *******************
// * B Z E R O _ N C *
// *******************
//
// void bzero_nc(char *addr, unsigned int length);
//
// For use with uncached memory. Doesn't seem to be used at all, so probably not
// performance critical. NB: we must avoid unaligned stores, because some
// machines (eg, 970) take alignment exceptions on _any_ unaligned op to uncached
// memory. Of course, we must also avoid dcbz.
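// (dcbz establishes and zeros an entire cache block; on caching-inhibited
// storage it is not guaranteed to work and may take an alignment interrupt,
// so the code below sticks to ordinary aligned stores.)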

LEXT(bzero_nc)
        cmplwi  cr1,r4,20                   // too short to bother with 16-byte loops?
        cmplwi  cr7,r4,0                    // check for (len==0)
        li      r6,0                        // get a 0
        bge     cr1,bznc1                   // skip if length >=20
        mtctr   r4                          // set up byte loop
        beqlr-- cr7                         // done if len=0

// Short operands, loop over bytes.

bznc0:
        stb     r6,0(r3)
        addi    r3,r3,1
        bdnz    bznc0
        blr

// Handle operands long enough to do doubleword stores; we must doubleword
// align, to avoid alignment exceptions.
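// The count is computed as r0 = (-addr) & 7, i.e. the number of bytes from
// addr up to the next 8-byte boundary (0 if already aligned).  For example,
// addr = 0x1003 gives (-0x1003) & 7 = 5, and 0x1003 + 5 = 0x1008.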

bznc1:
        neg     r7,r3                       // start to compute #bytes to align
        mfsprg  r10,2                       // get feature flags
        andi.   r0,r7,7                     // get #bytes to doubleword align
        mr      r5,r3                       // make copy of operand ptr as bcopy expects
        mtcrf   0x02,r10                    // put pf64Bitb etc in cr6
        beq     bzero_tail                  // already doubleword aligned
        sub     r4,r4,r0                    // adjust count
        mtctr   r0                          // set up loop
bznc2:                                      // zero bytes until doubleword aligned
        stb     r6,0(r5)
        addi    r5,r5,1
        bdnz    bznc2
        b       bzero_tail                  // join bzero, now that r5 is aligned


// *************     ***************
// * B Z E R O * and * M E M S E T *
// *************     ***************
//
// void * memset(void *b, int c, size_t len);
// void bzero(void *b, size_t len);
//
// These routines support G3, G4, and the 970, and run in both 32 and
// 64-bit mode. Lengths (size_t) are always 32 bits.
//
// Register use:
//    r0 = temp
//    r2 = temp
//    r3 = original ptr, not changed since memset returns it
//    r4 = count of bytes to set
//    r5 = working operand ptr ("rp")
//    r6 = value to store (usually 0)
//    r7-r9 = temps
//    r10 = feature flags
//    r11 = old MSR (if bzero_phys)
//    r12 = return address (if bzero_phys)
//    cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)
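//
// The feature flags come from sprg2 (mfsprg rX,2); "mtcrf 0x02,rX" copies the
// nibble of that word corresponding to CR field 6 into cr6 (mtcrf's 0x02 mask
// selects field 6), so bits such as pf64Bitb and pf128Byteb can be tested
// directly with bt++/bf.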

        .align  5
LEXT(memset)                                // void * memset(void *b, int c, size_t len);
        andi.   r6,r4,0xFF                  // copy value to working register, test for 0
        mr      r4,r5                       // move length to working register
        bne--   memset1                     // skip if nonzero
LEXT(bzero)                                 // void bzero(void *b, size_t len);
        dcbtst  0,r3                        // touch in 1st cache block
        mfsprg  r10,2                       // get features
        li      r6,0                        // get a 0
        neg     r7,r3                       // start to compute #bytes to align
        andi.   r0,r10,pf128Byte+pf32Byte   // get cache line size
        mtcrf   0x02,r10                    // put pf128Byte etc in cr6
        cmplw   r4,r0                       // operand length >= cache line size?
        mr      r5,r3                       // make copy of operand ptr (can't change r3)
        blt     bzero_tail                  // too short for dcbz (or dcbz128)
        rlwinm  r0,r7,0,0x1F                // get #bytes to 32-byte align
        rlwinm  r9,r7,0,0x7F                // get #bytes to 128-byte align
        bt++    pf128Byteb,bzero_128        // skip if 128-byte processor

// Operand length >=32 and cache line size is 32.
//    r0 = #bytes to 32-byte align
//    r4 = length
//    r5 = ptr to operand
//    r6 = 0

        sub     r2,r4,r0                    // adjust length
        cmpwi   cr1,r0,0                    // already 32-byte aligned?
        srwi.   r8,r2,5                     // get #32-byte chunks
        beq     bzero_tail                  // not long enough to dcbz
        mtctr   r8                          // set up loop count
        rlwinm  r4,r2,0,27,31               // mask down to leftover byte count
        beq     cr1,bz_dcbz32               // skip if already 32-byte aligned

// 32-byte align. We just store 32 0s, rather than test and use conditional
// branches. This is usually faster, because there are no mispredicts.
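// (Only r0 (1..31) of these 32 zero bytes are needed to reach the next
// 32-byte boundary; the rest land in storage the dcbz loop is about to zero
// anyway, so the overlap is harmless and cheaper than branching on the exact
// alignment.)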

        stw     r6,0(r5)                    // zero next 32 bytes
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        stw     r6,16(r5)
        stw     r6,20(r5)
        stw     r6,24(r5)
        stw     r6,28(r5)
        add     r5,r5,r0                    // now r5 is 32-byte aligned
        b       bz_dcbz32

// Loop doing 32-byte version of DCBZ instruction.
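// dcbz zeros the entire 32-byte cache block containing its effective address,
// so r5 must be block-aligned here for the byte accounting above to be exact.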

        .align  4                           // align the inner loop
bz_dcbz32:
        dcbz    0,r5                        // zero another 32 bytes
        addi    r5,r5,32
        bdnz    bz_dcbz32

// Store trailing bytes. This routine is used both by bzero and memset.
//    r4 = #bytes to store (may be large if memset)
//    r5 = address
//    r6 = value to store (in all 8 bytes)
//    cr6 = pf64Bit etc flags
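//
// "mtcrf 0x01,r4" copies the low nibble of the byte count into cr7, so the
// leftover 8-, 4-, 2-, and 1-byte pieces can be handled with bf/bflr tests of
// cr7 bits 28, 29, 30, and 31 instead of shifting and masking r4 again.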

bzero_tail:
        srwi.   r0,r4,4                     // get #(16-byte-chunks)
        mtcrf   0x01,r4                     // remaining byte count to cr7
        beq     bzt3                        // no 16-byte chunks
        mtctr   r0                          // set up loop count
        bt++    pf64Bitb,bzt2               // skip if 64-bit processor
        b       bzt1
        .align  5
bzt1:                                       // loop over 16-byte chunks on 32-bit processor
        stw     r6,0(r5)
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        addi    r5,r5,16
        bdnz    bzt1
        b       bzt3
        .align  5
bzt2:                                       // loop over 16-byte chunks on 64-bit processor
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bdnz    bzt2
        bf      28,bzt4                     // 8-byte chunk?
        std     r6,0(r5)
        addi    r5,r5,8
        b       bzt4
bzt3:
        bf      28,bzt4                     // 8-byte chunk?
        stw     r6,0(r5)
        stw     r6,4(r5)
        addi    r5,r5,8
bzt4:
        bf      29,bzt5                     // word?
        stw     r6,0(r5)
        addi    r5,r5,4
bzt5:
        bf      30,bzt6                     // halfword?
        sth     r6,0(r5)
        addi    r5,r5,2
bzt6:
        bflr    31                          // byte?
        stb     r6,0(r5)
        blr

// Operand length is >=128 and cache line size is 128. We assume that
// because the linesize is 128 bytes, this is a 64-bit processor.
//    r4 = length
//    r5 = ptr to operand
//    r6 = 0
//    r7 = neg(r5)
//    r9 = #bytes to 128-byte align
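//
// Strategy: store 16 zero bytes unconditionally and advance r5 by r0 (0..15)
// to reach a 16-byte boundary (safe, since the length is at least 128 here),
// loop 16 bytes at a time until 128-byte aligned, then let dcbz128 (presumably
// the 970's full-cache-line variant of dcbz) zero whole 128-byte lines.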

        .align  5
bzero_128:
        sub     r2,r4,r9                    // r2 <- length remaining after cache-line aligning
        rlwinm  r0,r7,0,0xF                 // r0 <- #bytes to 16-byte align
        srwi.   r8,r2,7                     // r8 <- number of cache lines to 0
        std     r6,0(r5)                    // always store 16 bytes to 16-byte align...
        std     r6,8(r5)                    // ...even if too short for dcbz128
        add     r5,r5,r0                    // 16-byte align ptr
        sub     r4,r4,r0                    // adjust count
        beq     bzero_tail                  // r8==0, not long enough to dcbz128
        sub.    r7,r9,r0                    // get #bytes remaining to 128-byte align
        rlwinm  r4,r2,0,0x7F                // r4 <- length remaining after dcbz128'ing
        mtctr   r8                          // set up dcbz128 loop
        beq     bz_dcbz128                  // already 128-byte aligned
        b       bz_align                    // enter loop over 16-byte chunks

// 128-byte align by looping over 16-byte chunks.

        .align  5
bz_align:                                   // loop over 16-byte chunks
        subic.  r7,r7,16                    // more to go?
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bgt     bz_align

        b       bz_dcbz128                  // enter dcbz128 loop

// Loop over 128-byte cache lines.
//    r4 = length remaining after cache lines (0..127)
//    r5 = ptr (128-byte aligned)
//    r6 = 0
//    ctr = count of cache lines to 0

        .align  5
bz_dcbz128:
        dcbz128 0,r5                        // zero a 128-byte cache line
        addi    r5,r5,128
        bdnz    bz_dcbz128

        b       bzero_tail                  // handle leftovers


// Handle memset() for nonzero values. This case is relatively infrequent;
// the large majority of memset() calls are for 0.
//    r3 = ptr
//    r4 = count
//    r6 = value in lower byte (nonzero)
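//
// The three rotate-and-insert instructions below fan the byte out to every
// byte lane of r6.  For example, with c = 0xAB:
//    rlwimi  r6,r6,8,16,23       0x000000AB -> 0x0000ABAB
//    rlwimi  r6,r6,16,0,15       0x0000ABAB -> 0xABABABAB
//    rlwinm  r6,r6,0,1,0         0xABABABAB -> 0xABABABABABABABAB on 64-bit
//                                registers (a no-op on 32-bit processors,
//                                which never reach the "std" paths)
// The pair of stw's is done unconditionally; the pointer then advances by only
// r0 (0..7) bytes, and any bytes stored twice get the same value, so the
// overlap is harmless.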

memset1:
        cmplwi  r4,16                       // too short to bother aligning?
        rlwimi  r6,r6,8,16,23               // replicate value to low 2 bytes
        mr      r5,r3                       // make working copy of operand ptr
        rlwimi  r6,r6,16,0,15               // value now in all 4 bytes
        blt     bzero_tail                  // length<16, we won't be using "std"
        mfsprg  r10,2                       // get feature flags
        neg     r7,r5                       // start to compute #bytes to align
        rlwinm  r6,r6,0,1,0                 // value now in all 8 bytes (if 64-bit)
        andi.   r0,r7,7                     // r0 <- #bytes to doubleword align
        stw     r6,0(r5)                    // store 8 bytes to avoid a loop
        stw     r6,4(r5)
        mtcrf   0x02,r10                    // get pf64Bit flag etc in cr6
        sub     r4,r4,r0                    // adjust count
        add     r5,r5,r0                    // doubleword align ptr
        b       bzero_tail