/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <ppc/exception.h>

// ***********************
// * B Z E R O _ P H Y S *
// ***********************
//
// void bzero_phys(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5. We leave cache on.
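//
// Roughly equivalent C-level sketch (illustrative only; it assumes, per the comments
// below, that ml_set_physical_get_ffs() hands back the old MSR after turning data
// relocation off and 64-bit addressing on, and that ml_restore() takes that MSR back):
//
//      void bzero_phys(addr64_t phys_addr, uint32_t length)
//      {
//          unsigned int old_msr = ml_set_physical_get_ffs();  // DR off, SF on
//          bzero((void *)(uintptr_t)phys_addr, length);       // ordinary cached bzero
//          ml_restore(old_msr);                               // DR back on, SF off
//      }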
        mflr    r12                     // save return address
        rlwinm  r3,r3,0,1,0             // coalesce long-long in (r3,r4) into reg64_t in r3
        mr      r4,r5                   // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs)  // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero)              // use normal bzero() routine
        mtlr    r12                     // restore return address
        b       EXT(ml_restore)         // restore MSR, turning DR on and SF off

// *******************
// * B Z E R O _ N C *
// *******************
//
// void bzero_nc(char *addr, unsigned int length);
//
// For use with uncached memory. Doesn't seem to be used at all, so probably not
// performance critical. NB: we must avoid unaligned stores, because some
// machines (eg, 970) take alignment exceptions on _any_ unaligned op to uncached
// memory. Of course, we must also avoid dcbz.
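//
// Illustrative sketch of the strategy used below (it mirrors the 20-byte cutoff and
// the doubleword-alignment step visible in the code):
//
//      if (len < 20) {                         // short: simple byte loop
//          while (len--) *p++ = 0;
//      } else {
//          while ((uintptr_t)p & 7) {          // bytes until doubleword aligned
//              *p++ = 0;
//              len--;
//          }
//          /* fall into bzero_tail, which uses aligned stores only */
//      }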

        cmplwi  cr1,r4,20               // too short to bother with 16-byte loops?
        cmplwi  cr7,r4,0                // check for (len==0)
        bge     cr1,bznc1               // skip if length >=20
        mtctr   r4                      // set up byte loop
        beqlr-- cr7                     // done if len=0

// Short operands, loop over bytes.

// Handle operands long enough to do doubleword stores; we must doubleword
// align, to avoid alignment exceptions.
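//
// The "neg/andi." pair below computes the byte count needed to reach an 8-byte
// boundary; in C terms (illustrative): nalign = (0 - (uintptr_t)addr) & 7.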

        neg     r7,r3                   // start to compute #bytes to align
        mfsprg  r10,2                   // get feature flags
        andi.   r0,r7,7                 // get #bytes to doubleword align
        mr      r5,r3                   // make copy of operand ptr, as bzero_tail expects
        mtcrf   0x02,r10                // put pf64Bitb etc in cr6
        beq     bzero_tail              // already doubleword aligned
        sub     r4,r4,r0                // adjust count
        mtctr   r0                      // set up loop
bznc2:                                  // zero bytes until doubleword aligned
        b       bzero_tail              // join bzero, now that r5 is aligned

// *************     ***************
// * B Z E R O * and * M E M S E T *
// *************     ***************
//
// void * memset(void *b, int c, size_t len);
// void   bzero(void *b, size_t len);
//
// These routines support G3, G4, and the 970, and run in both 32- and
// 64-bit mode. Lengths (size_t) are always 32 bits.
//
// Register use:
//      r3  = original ptr, not changed since memset returns it
//      r4  = count of bytes to set
//      r5  = working operand ptr ("rp")
//      r6  = value to store (usually 0)
//      r10 = feature flags
//      r11 = old MSR (if bzero_phys)
//      r12 = return address (if bzero_phys)
//      cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)
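//
// Illustrative outline of the zero path that follows (a sketch of the flow only; the
// cache-line size of 32 or 128 bytes comes from the feature flags, as in the code):
//
//      if (len < linesize)
//          goto bzero_tail;                    // too short for dcbz
//      store zeros up to the next cache-line boundary;
//      while (a full cache line remains)
//          dcbz(p), p += linesize;             // zero whole lines in the cache
//      bzero_tail:                             // 0..linesize-1 leftover bytes
//          store 16/8/4/2/1-byte pieces selected by the low bits of the count;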

LEXT(memset)                            // void * memset(void *b, int c, size_t len);
        andi.   r6,r4,0xFF              // copy value to working register, test for 0
        mr      r4,r5                   // move length to working register
        bne--   memset1                 // skip to memset1 if value is nonzero
LEXT(bzero)                             // void bzero(void *b, size_t len);
        dcbtst  0,r3                    // touch in 1st cache block
        mfsprg  r10,2                   // get features
        neg     r7,r3                   // start to compute #bytes to align
        andi.   r0,r10,pf128Byte+pf32Byte // get cache line size
        mtcrf   0x02,r10                // put pf128Byte etc in cr6
        cmplw   r4,r0                   // operand length >= cache line size?
        mr      r5,r3                   // make copy of operand ptr (can't change r3)
        blt     bzero_tail              // too short for dcbz (or dcbz128)
        rlwinm  r0,r7,0,0x1F            // get #bytes to 32-byte align
        rlwinm  r9,r7,0,0x7F            // get #bytes to 128-byte align
        bt++    pf128Byteb,bzero_128    // skip if 128-byte processor

// Operand length >=32 and cache line size is 32.
//      r0 = #bytes to 32-byte align
//      r5 = ptr to operand

        sub     r2,r4,r0                // adjust length
        cmpwi   cr1,r0,0                // already 32-byte aligned?
        srwi.   r8,r2,5                 // get #32-byte chunks
        beq     bzero_tail              // not long enough to dcbz
        mtctr   r8                      // set up loop count
        rlwinm  r4,r2,0,27,31           // mask down to leftover byte count
        beq     cr1,bz_dcbz32           // skip if already 32-byte aligned

// 32-byte align. We just store 32 0s, rather than test and use conditional
// branches. This is usually faster, because there are no mispredicts.
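//
// Note that the 32 zero bytes are stored unconditionally and r5 is then advanced by
// only r0 (0..31 bytes): this path is taken only when at least r0+32 bytes remain,
// so the stores stay in bounds, and the overlapped bytes are simply re-zeroed by the
// dcbz loop.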

        stw     r6,0(r5)                // zero next 32 bytes
        add     r5,r5,r0                // now r5 is 32-byte aligned

// Loop doing 32-byte version of DCBZ instruction.

        .align  4                       // align the inner loop
        dcbz    0,r5                    // zero another 32 bytes
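//
// dcbz establishes the addressed 32-byte block in the data cache as zeros without
// first reading it from memory; the loop amounts to (illustrative):
//      while (lines--) { dcbz(p); p += 32; }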

// Store trailing bytes. This routine is used both by bzero and memset.
//      r4  = #bytes to store (may be large if memset)
//      r6  = value to store (in all 8 bytes)
//      cr6 = pf64Bit etc flags
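//
// Illustrative sketch of the tail logic (cr7 receives the low bits of the count, so
// CR bits 28..31 select the 8/4/2/1-byte leftovers):
//
//      for (i = 0; i < len >> 4; i++)  store 16 bytes;     // stw x4 or std x2
//      if (len & 8)  store 8 bytes;
//      if (len & 4)  store 4 bytes;
//      if (len & 2)  store 2 bytes;
//      if (len & 1)  store the last byte;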

        srwi.   r0,r4,4                 // get #(16-byte-chunks)
        mtcrf   0x01,r4                 // remaining byte count to cr7
        beq     bzt3                    // no 16-byte chunks
        mtctr   r0                      // set up loop count
        bt++    pf64Bitb,bzt2           // skip if 64-bit processor

bzt1:                                   // loop over 16-byte chunks on 32-bit processor

bzt2:                                   // loop over 16-byte chunks on 64-bit processor

        bf      28,bzt4                 // 8-byte chunk?

        bf      28,bzt4                 // 8-byte chunk?

        bf      30,bzt6                 // halfword?

// Operand length is >=128 and cache line size is 128. We assume that
// because the linesize is 128 bytes, this is a 64-bit processor.
//      r5 = ptr to operand
//      r9 = #bytes to 128-byte align
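//
// Outline of the 128-byte path below (illustrative): a pair of "std" stores reaches
// the next 16-byte boundary, a short loop of 16-byte chunks reaches the 128-byte
// boundary, dcbz128 then zeros whole cache lines, and bzero_tail finishes the
// remaining 0..127 bytes.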

        sub     r2,r4,r9                // r2 <- length remaining after cache-line aligning
        rlwinm  r0,r7,0,0xF             // r0 <- #bytes to 16-byte align
        srwi.   r8,r2,7                 // r8 <- number of cache lines to 0
        std     r6,0(r5)                // always store 16 bytes to 16-byte align...
        std     r6,8(r5)                // ...even if too short for dcbz128
        add     r5,r5,r0                // 16-byte align ptr
        sub     r4,r4,r0                // adjust count
        beq     bzero_tail              // r8==0, not long enough to dcbz128
        sub.    r7,r9,r0                // get #bytes remaining to 128-byte align
        rlwinm  r4,r2,0,0x7F            // r4 <- length remaining after dcbz128'ing
        mtctr   r8                      // set up dcbz128 loop
        beq     bz_dcbz128              // already 128-byte aligned
        b       bz_align                // enter loop over 16-byte chunks

// 128-byte align by looping over 16-byte chunks.

bz_align:                               // loop over 16-byte chunks
        subic.  r7,r7,16                // more to go?
        b       bz_dcbz128              // enter dcbz128 loop

// Loop over 128-byte cache lines.
//      r4  = length remaining after cache lines (0..127)
//      r5  = ptr (128-byte aligned)
//      ctr = count of cache lines to 0
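//
// As in the 32-byte case, each dcbz128 zeros an entire 128-byte line in the data
// cache without first reading it from memory; roughly (illustrative):
//      while (lines--) { dcbz128(p); p += 128; }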

        dcbz128 0,r5                    // zero a 128-byte cache line
        b       bzero_tail              // handle leftovers

// Handle memset() for nonzero values. This case is relatively infrequent;
// the large majority of memset() calls are for 0.
//      r6 = value in lower byte (nonzero)
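//
// The rlwimi/rlwinm sequence below replicates that byte into every byte of the
// register; an equivalent C-level sketch (illustrative, with v a 64-bit unsigned):
//
//      v  = c & 0xFF;
//      v |= v << 8;            // value in low 2 bytes  (rlwimi r6,r6,8,16,23)
//      v |= v << 16;           // value in all 4 bytes  (rlwimi r6,r6,16,0,15)
//      v |= v << 32;           // value in all 8 bytes  (rlwinm r6,r6,0,1,0)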

        cmplwi  r4,16                   // too short to bother aligning?
        rlwimi  r6,r6,8,16,23           // replicate value to low 2 bytes
        mr      r5,r3                   // make working copy of operand ptr
        rlwimi  r6,r6,16,0,15           // value now in all 4 bytes
        blt     bzero_tail              // length<16, we won't be using "std"
        mfsprg  r10,2                   // get feature flags
        neg     r7,r5                   // start to compute #bytes to align
        rlwinm  r6,r6,0,1,0             // value now in all 8 bytes (if 64-bit)
        andi.   r0,r7,7                 // r0 <- #bytes to doubleword align
        stw     r6,0(r5)                // store 8 bytes to avoid a loop
        stw     r6,4(r5)
        mtcrf   0x02,r10                // get pf64Bit flag etc in cr6
        sub     r4,r4,r0                // adjust count
        add     r5,r5,r0                // doubleword align ptr
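
// Like the zero path, this deliberately overshoots: 8 bytes of the value are stored
// before the pointer is bumped by the 0..7 alignment bytes, so the overlapped bytes
// are simply written twice with the same value; length >= 16 here, so the stores
// stay in bounds.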