/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <ppc/asm.h>
#include <ppc/exception.h>
#include <assym.s>

        .text
        .align  2
        .globl  _memset
        .globl  _bzero
        .globl  _bzero_nc
        .globl  _bzero_phys
        .globl  _bzero_phys_nc


// *****************************
// * B Z E R O _ P H Y S _ N C *
// *****************************
//
//      void bzero_phys_nc(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5.  NO CACHING

        .align  5
LEXT(bzero_phys_nc)
        mflr    r12                     // save return address
        rlwinm  r3,r3,0,1,0             // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31
        mr      r4,r5                   // put length where bzero_nc() expects it
        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero_nc)           // use the uncached bzero_nc() routine
        mtlr    r12                     // restore return address
        b       EXT(ml_restore)         // restore MSR, turning DR on and SF off

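// A rough C model of the rlwinm/rlwimi pair above (bzero_phys below uses the same
// sequence); the helper and parameter names here are hypothetical, for illustration
// only:
//
//      #include <stdint.h>
//
//      static uint64_t coalesce_phys(uint32_t hi, uint32_t lo)   /* hi in r3, lo in r4 */
//      {
//          return ((uint64_t)hi << 32) | lo;
//      }
//
// The wrap-around mask in rlwinm (MB=1,ME=0) duplicates r3's low word into its high
// word, and rlwimi then overwrites the low word with r4, leaving the full 64-bit
// physical address in r3.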

// ***********************
// * B Z E R O _ P H Y S *
// ***********************
//
//      void bzero_phys(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5.  We leave cache on.

        .align  5
LEXT(bzero_phys)
        mflr    r12                     // save return address
        rlwinm  r3,r3,0,1,0             // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31
        mr      r4,r5                   // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero)              // use normal bzero() routine
        mtlr    r12                     // restore return address
        b       EXT(ml_restore)         // restore MSR, turning DR on and SF off


// *******************
// * B Z E R O _ N C *
// *******************
//
//      void bzero_nc(char *addr, unsigned int length);
//
// For use with uncached memory.  Doesn't seem to be used at all, so probably not
// performance critical.  NB: we must avoid unaligned stores, because some
// machines (eg, 970) take alignment exceptions on _any_ unaligned op to uncached
// memory.  Of course, we must also avoid dcbz.

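// A rough C model of the approach (hypothetical names, illustration only): short
// requests use a plain byte loop; longer ones byte-store up to a doubleword
// boundary so that every later store is naturally aligned, then share the common
// bzero_tail code:
//
//      #include <stdint.h>
//
//      static void bzero_nc_model(char *p, unsigned int len)
//      {
//          if (len >= 20) {
//              while ((uintptr_t)p & 7) {  /* byte loop up to a doubleword boundary */
//                  *p++ = 0;
//                  len--;
//              }
//              /* the real code then joins bzero_tail, which uses only naturally
//                 aligned 8/4/2/1-byte stores on the remainder */
//          }
//          while (len--)                   /* short case, and a stand-in for the tail */
//              *p++ = 0;
//      }
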
LEXT(bzero_nc)
        cmplwi  cr1,r4,20               // too short to bother with 16-byte loops?
        cmplwi  cr7,r4,0                // check for (len==0)
        li      r6,0                    // get a 0
        bge     cr1,bznc1               // skip if length >=20
        mtctr   r4                      // set up byte loop
        beqlr-- cr7                     // done if len=0

// Short operands, loop over bytes.

bznc0:
        stb     r6,0(r3)
        addi    r3,r3,1
        bdnz    bznc0
        blr

// Handle operands long enough to do doubleword stores; we must doubleword
// align, to avoid alignment exceptions.

bznc1:
        neg     r7,r3                   // start to compute #bytes to align
        mfsprg  r10,2                   // get feature flags
        andi.   r0,r7,7                 // get #bytes to doubleword align
        mr      r5,r3                   // make copy of operand ptr, as bzero_tail expects
        mtcrf   0x02,r10                // put pf64Bitb etc in cr6
        beq     bzero_tail              // already doubleword aligned
        sub     r4,r4,r0                // adjust count
        mtctr   r0                      // set up loop
bznc2:                                  // zero bytes until doubleword aligned
        stb     r6,0(r5)
        addi    r5,r5,1
        bdnz    bznc2
        b       bzero_tail              // join bzero, now that r5 is aligned


// *************     ***************
// * B Z E R O * and * M E M S E T *
// *************     ***************
//
//      void * memset(void *b, int c, size_t len);
//      void bzero(void *b, size_t len);
//
// These routines support G3, G4, and the 970, and run in both 32 and
// 64-bit mode.  Lengths (size_t) are always 32 bits.
//
// Register use:
//      r0 = temp
//      r2 = temp
//      r3 = original ptr, not changed since memset returns it
//      r4 = count of bytes to set
//      r5 = working operand ptr ("rp")
//      r6 = value to store (usually 0)
//      r7-r9 = temps
//      r10 = feature flags
//      r11 = old MSR (if bzero_phys)
//      r12 = return address (if bzero_phys)
//      cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)

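// A rough C reference model of these entry points (hypothetical name, illustration
// only).  The real code batches the stores into 16-byte chunks and whole cache
// lines via dcbz/dcbz128, but the observable effect is the same:
//
//      #include <stddef.h>
//
//      void *memset_model(void *b, int c, size_t len)
//      {
//          unsigned char *p = (unsigned char *)b;
//          unsigned char v = (unsigned char)c;     /* andi. r6,r4,0xFF below */
//          while (len--)
//              *p++ = v;
//          return b;                   /* r3 is never modified, so memset can return it */
//      }
//
// bzero(b, len) is simply the v==0 case, which is why that path falls straight
// through into the dcbz-based code below.
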
        .align  5
LEXT(memset)                            // void * memset(void *b, int c, size_t len);
        andi.   r6,r4,0xFF              // copy value to working register, test for 0
        mr      r4,r5                   // move length to working register
        bne--   memset1                 // skip if nonzero
LEXT(bzero)                             // void bzero(void *b, size_t len);
        dcbtst  0,r3                    // touch in 1st cache block
        mfsprg  r10,2                   // get features
        li      r6,0                    // get a 0
        neg     r7,r3                   // start to compute #bytes to align
        andi.   r0,r10,pf128Byte+pf32Byte // get cache line size
        mtcrf   0x02,r10                // put pf128Byte etc in cr6
        cmplw   r4,r0                   // operand length >= cache line size?
        mr      r5,r3                   // make copy of operand ptr (can't change r3)
        blt     bzero_tail              // too short for dcbz (or dcbz128)
        rlwinm  r0,r7,0,0x1F            // get #bytes to 32-byte align
        rlwinm  r9,r7,0,0x7F            // get #bytes to 128-byte align
        bt++    pf128Byteb,bzero_128    // skip if 128-byte processor

// Operand length >=32 and cache line size is 32.
//      r0 = #bytes to 32-byte align
//      r4 = length
//      r5 = ptr to operand
//      r6 = 0

        sub     r2,r4,r0                // adjust length
        cmpwi   cr1,r0,0                // already 32-byte aligned?
        srwi.   r8,r2,5                 // get #32-byte chunks
        beq     bzero_tail              // not long enough to dcbz
        mtctr   r8                      // set up loop count
        rlwinm  r4,r2,0,27,31           // mask down to leftover byte count
        beq     cr1,bz_dcbz32           // skip if already 32-byte aligned

// 32-byte align.  We just store 32 0s, rather than test and use conditional
// branches.  This is usually faster, because there are no mispredicts.

        stw     r6,0(r5)                // zero next 32 bytes
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        stw     r6,16(r5)
        stw     r6,20(r5)
        stw     r6,24(r5)
        stw     r6,28(r5)
        add     r5,r5,r0                // now r5 is 32-byte aligned
        b       bz_dcbz32

// Loop doing 32-byte version of DCBZ instruction.

        .align  4                       // align the inner loop
bz_dcbz32:
        dcbz    0,r5                    // zero another 32 bytes
        addi    r5,r5,32
        bdnz    bz_dcbz32

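// A rough C model of the 32-byte path above (hypothetical names, illustration
// only).  Note the alignment trick: when the pointer is not 32-byte aligned, 32
// zero bytes are stored at the original pointer and the pointer is then advanced
// by the 0..31 byte offset, so the unaligned prefix is covered by overlapping
// stores instead of a branchy byte loop:
//
//      #include <stdint.h>
//      #include <string.h>
//
//      static void bzero_32_model(unsigned char *p, uint32_t len)
//      {
//          uint32_t skew = (uint32_t)(-(uintptr_t)p & 31);  /* #bytes to 32-byte align */
//          if (len >= 32 + skew) {
//              uint32_t chunks = (len - skew) >> 5;
//              if (skew)
//                  memset(p, 0, 32);                   /* the eight stw, possibly overlapping */
//              p += skew;
//              while (chunks--) {                      /* dcbz zeroes a whole 32-byte line */
//                  memset(p, 0, 32); p += 32;
//              }
//              len = (len - skew) & 31;
//          }
//          while (len--)                               /* leftovers -> bzero_tail */
//              *p++ = 0;
//      }
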
// Store trailing bytes.  This routine is used both by bzero and memset.
//      r4 = #bytes to store (may be large if memset)
//      r5 = address
//      r6 = value to store (in all 8 bytes)
//      cr6 = pf64Bit etc flags

bzero_tail:
        srwi.   r0,r4,4                 // get #(16-byte-chunks)
        mtcrf   0x01,r4                 // remaining byte count to cr7
        beq     bzt3                    // no 16-byte chunks
        mtctr   r0                      // set up loop count
        bt++    pf64Bitb,bzt2           // skip if 64-bit processor
        b       bzt1
        .align  5
bzt1:                                   // loop over 16-byte chunks on 32-bit processor
        stw     r6,0(r5)
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        addi    r5,r5,16
        bdnz    bzt1
        b       bzt3
        .align  5
bzt2:                                   // loop over 16-byte chunks on 64-bit processor
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bdnz    bzt2
        bf      28,bzt4                 // 8-byte chunk?
        std     r6,0(r5)
        addi    r5,r5,8
        b       bzt4
bzt3:
        bf      28,bzt4                 // 8-byte chunk?
        stw     r6,0(r5)
        stw     r6,4(r5)
        addi    r5,r5,8
bzt4:
        bf      29,bzt5                 // word?
        stw     r6,0(r5)
        addi    r5,r5,4
bzt5:
        bf      30,bzt6                 // halfword?
        sth     r6,0(r5)
        addi    r5,r5,2
bzt6:
        bflr    31                      // byte?
        stb     r6,0(r5)
        blr

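// A rough C model of the trailing-store logic above (hypothetical names, for
// illustration).  Moving the low bits of the count into cr7 lets single "bf"
// branches test the 8/4/2/1 bits instead of re-examining r4; v holds the fill
// value replicated into all eight bytes:
//
//      #include <stdint.h>
//      #include <string.h>
//
//      static void bzero_tail_model(unsigned char *p, uint32_t len, uint64_t v)
//      {
//          for (uint32_t n = len >> 4; n; n--) {       /* 16-byte chunks (stw/std loops) */
//              memcpy(p, &v, 8); memcpy(p + 8, &v, 8); p += 16;
//          }
//          if (len & 8) { memcpy(p, &v, 8); p += 8; }  /* bf 28 */
//          if (len & 4) { memcpy(p, &v, 4); p += 4; }  /* bf 29 */
//          if (len & 2) { memcpy(p, &v, 2); p += 2; }  /* bf 30 */
//          if (len & 1) { *p = (unsigned char)v; }     /* bflr 31 */
//      }
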
// Operand length is >=128 and cache line size is 128.  We assume that
// because the linesize is 128 bytes, this is a 64-bit processor.
//      r4 = length
//      r5 = ptr to operand
//      r6 = 0
//      r7 = neg(r5)
//      r9 = #bytes to 128-byte align

        .align  5
bzero_128:
        sub     r2,r4,r9                // r2 <- length remaining after cache-line aligning
        rlwinm  r0,r7,0,0xF             // r0 <- #bytes to 16-byte align
        srwi.   r8,r2,7                 // r8 <- number of cache lines to 0
        std     r6,0(r5)                // always store 16 bytes to 16-byte align...
        std     r6,8(r5)                // ...even if too short for dcbz128
        add     r5,r5,r0                // 16-byte align ptr
        sub     r4,r4,r0                // adjust count
        beq     bzero_tail              // r8==0, not long enough to dcbz128
        sub.    r7,r9,r0                // get #bytes remaining to 128-byte align
        rlwinm  r4,r2,0,0x7F            // r4 <- length remaining after dcbz128'ing
        mtctr   r8                      // set up dcbz128 loop
        beq     bz_dcbz128              // already 128-byte aligned
        b       bz_align                // enter loop over 16-byte chunks

// 128-byte align by looping over 16-byte chunks.

        .align  5
bz_align:                               // loop over 16-byte chunks
        subic.  r7,r7,16                // more to go?
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bgt     bz_align

        b       bz_dcbz128              // enter dcbz128 loop

// Loop over 128-byte cache lines.
//      r4 = length remaining after cache lines (0..127)
//      r5 = ptr (128-byte aligned)
//      r6 = 0
//      ctr = count of cache lines to 0

        .align  5
bz_dcbz128:
        dcbz128 0,r5                    // zero a 128-byte cache line
        addi    r5,r5,128
        bdnz    bz_dcbz128

        b       bzero_tail              // handle leftovers

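// A rough C model of the 128-byte-line path above (hypothetical names, for
// illustration).  Alignment happens in two stages because dcbz128 needs a
// 128-byte-aligned address: first reach a 16-byte boundary (the real code does
// this with one overlapping pair of std instructions), then step by 16 bytes to
// a line boundary, then zero whole lines:
//
//      #include <stdint.h>
//      #include <string.h>
//
//      static void bzero_128_model(unsigned char *p, uint32_t len)
//      {
//          while (((uintptr_t)p & 15) && len) {        /* stage 1: 16-byte align      */
//              *p++ = 0; len--;
//          }
//          while (((uintptr_t)p & 127) && len >= 16) { /* stage 2: bz_align std pairs */
//              memset(p, 0, 16); p += 16; len -= 16;
//          }
//          while (len >= 128) {                        /* stage 3: dcbz128 per line   */
//              memset(p, 0, 128); p += 128; len -= 128;
//          }
//          while (len--)                               /* leftovers -> bzero_tail     */
//              *p++ = 0;
//      }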

// Handle memset() for nonzero values.  This case is relatively infrequent;
// the large majority of memset() calls are for 0.
//      r3 = ptr
//      r4 = count
//      r6 = value in lower byte (nonzero)

memset1:
        cmplwi  r4,16                   // too short to bother aligning?
        rlwimi  r6,r6,8,16,23           // replicate value to low 2 bytes
        mr      r5,r3                   // make working copy of operand ptr
        rlwimi  r6,r6,16,0,15           // value now in all 4 bytes
        blt     bzero_tail              // length<16, we won't be using "std"
        mfsprg  r10,2                   // get feature flags
        neg     r7,r5                   // start to compute #bytes to align
        rlwinm  r6,r6,0,1,0             // value now in all 8 bytes (if 64-bit)
        andi.   r0,r7,7                 // r0 <- #bytes to doubleword align
        stw     r6,0(r5)                // store 8 bytes to avoid a loop
        stw     r6,4(r5)
        mtcrf   0x02,r10                // get pf64Bit flag etc in cr6
        sub     r4,r4,r0                // adjust count
        add     r5,r5,r0                // doubleword align ptr
        b       bzero_tail
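
// A rough C model of the byte replication above (hypothetical name, illustration
// only): the two rlwimi instructions spread the low byte of c across a word, and
// the final rlwinm (the same wrap-around-mask trick used in bzero_phys) duplicates
// that word into the upper half so that "std" stores eight copies at once on
// 64-bit processors:
//
//      #include <stdint.h>
//
//      static uint64_t replicate_byte(int c)
//      {
//          uint32_t v = (uint32_t)c & 0xFF;    /* andi.  r6,r4,0xFF                   */
//          v |= v << 8;                        /* rlwimi r6,r6,8,16,23  (low 2 bytes) */
//          v |= v << 16;                       /* rlwimi r6,r6,16,0,15  (all 4 bytes) */
//          return ((uint64_t)v << 32) | v;     /* rlwinm r6,r6,0,1,0    (all 8 bytes) */
//      }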