apple/libc.git (Libc-391): ppc/string/memset.s
/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
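
// cpu_capabilities.h supplies the fixed commpage addresses referenced below
// (_COMM_PAGE_BZERO, _COMM_PAGE_MEMSET_PATTERN, _COMM_PAGE_VERSION); the kernel
// maps CPU-tuned implementations of those routines at these addresses in every
// process, so the code below can branch to them directly.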

/* We use mode-independent "g" opcodes such as "srgi". These expand
 * into word operations when targeting __ppc__, and into doubleword
 * operations when targeting __ppc64__.
 */
#include <architecture/ppc/mode_independent_asm.h>
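
// For example, a mode-independent "srgi" below is expected to assemble as
// "srwi" (shift right word immediate) when building for __ppc__, and as
// "srdi" (shift right doubleword immediate) when building for __ppc64__.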


        .text
#define kShort  128                     // threshold for calling commpage


/* ***************
 * * M E M S E T *
 * ***************
 *
 * Registers we use:
 *      r3 = original ptr, not changed since memset returns it
 *      r4 = count of bytes to set
 *      r7 = value to set
 *      r8 = working operand ptr
 */

        .globl _memset
        .align 5
_memset:                                // void * memset(void *b, int c, size_t len);
        andi.   r7,r4,0xFF              // copy value to working register, test for 0
        mr      r4,r5                   // move length to working register
        cmplgi  cr1,r5,kShort           // long enough to bother with _COMM_PAGE_MEMSET_PATTERN?
        beqa++  _COMM_PAGE_BZERO        // if (c==0), map to bzero()
        rlwimi  r7,r7,8,16,23           // replicate nonzero value to low 2 bytes
        neg     r5,r3                   // start to compute #bytes to align
        mr      r8,r3                   // make working copy of operand ptr
        rlwimi  r7,r7,16,0,15           // value now in all 4 bytes
        blt     cr1,Lmemset3            // too short to use commpage
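
// Sketch (C, for illustration only): the andi./rlwimi sequence above replicates
// the low byte of the fill value into all four bytes of r7, roughly:
//
//      uint32_t v = c & 0xFF;          // keep low byte only
//      v |= v << 8;                    // value now in low 2 bytes
//      v |= v << 16;                   // value now in all 4 bytes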

// TEMPORARY HACK
// Operand is long enough to use _COMM_PAGE_MEMSET_PATTERN. During Tiger
// development, B&I uses Panther kernels on their builders but runs Tiger
// apps on them. So _COMM_PAGE_MEMSET_PATTERN may not be on this machine.
// Rather than patch the build fleet kernels, we just test to see if it is there
// and use the short-operand case if not. We can remove the hack when Tiger ships.

        lhz     r10,_COMM_PAGE_VERSION(0)   // REMOVE THIS LINE WHEN TIGER SHIPS
        andi.   r0,r5,0xF               // r0 <- #bytes to align on quadword

// Align ptr and store enough so that we have an aligned 16-byte pattern.

        stw     r7,0(r8)
        stw     r7,4(r8)
        stw     r7,8(r8)
        stw     r7,12(r8)
        cmpwi   cr1,r10,1               // REMOVE THIS LINE WHEN TIGER SHIPS
        beq     Lmemset1                // skip if (r0==0), ie if r8 is 16-byte aligned
        add     r8,r8,r0                // 16-byte align ptr
        sub     r4,r4,r0                // adjust length
        stw     r7,0(r8)                // now we can store an aligned 16-byte pattern
        stw     r7,4(r8)
        stw     r7,8(r8)
        stw     r7,12(r8)

// Call machine-specific commpage routine, which expects:
//      r4 = count (>=32)
//      r8 = ptr (16-byte aligned) to memory to store
//      r9 = ptr (16-byte aligned) to 16-byte pattern to store
// When it returns:
//      r3, r7, and r12 are preserved
//      r4 and r8 are updated to reflect a residual count of 0..31 bytes

Lmemset1:
        mflr    r12                     // save return address
        mr      r9,r8                   // point to 16-byte-aligned 16-byte pattern
        addi    r8,r8,16                // point to first unstored byte
        subi    r4,r4,16                // account for the aligned bytes we have stored
        bnela++ cr1,_COMM_PAGE_MEMSET_PATTERN   // CHANGE THIS LINE WHEN TIGER SHIPS
        mtlr    r12

// Here for short nonzero memset.
//      r4 = count (<= kShort bytes)
//      r7 = pattern in all four bytes
//      r8 = ptr
Lmemset3:
        srgi.   r0,r4,4                 // any 16-byte chunks?
        mtcrf   0x01,r4                 // move length remaining to cr7 so we can test bits
        beq     Lmemset5                // fewer than 16 bytes
        mtctr   r0
        b       Lmemset4                // enter loop

        .align  5
Lmemset4:                               // loop over 16-byte chunks
        stw     r7,0(r8)
        stw     r7,4(r8)
        stw     r7,8(r8)
        stw     r7,12(r8)
        addi    r8,r8,16
        bdnz++  Lmemset4

// Handle last 0..15 bytes.
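// The mtcrf above moved the low 4 bits of the length into cr7, so the bf
// (branch-if-false) tests below amount to this sketch:
//
//      if (len & 8)  store 8 bytes
//      if (len & 4)  store 4 bytes
//      if (len & 2)  store 2 bytes
//      if (len & 1)  store 1 byte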
Lmemset5:
        bf      28,2f
        stw     r7,0(r8)
        stw     r7,4(r8)
        addi    r8,r8,8
2:
        bf      29,3f
        stw     r7,0(r8)
        addi    r8,r8,4
3:
        bf      30,4f
        sth     r7,0(r8)
        addi    r8,r8,2
4:
        bflr    31
        stb     r7,0(r8)
        blr


/* *************************************
 * * _ M E M S E T _ P A T T E R N 1 6 *
 * *************************************
 *
 * Used to store a 16-byte pattern in memory:
 *
 *      void _memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Where c16 points to the 16-byte pattern. None of the parameters need be aligned.
 */
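
// Hypothetical usage sketch (C, not part of this file; fill_demo and the pattern
// values are made up). Note that the length need not be a multiple of 16: the
// leftover code below stores a truncated final copy of the pattern.
//
//      #include <stddef.h>
//      extern void _memset_pattern16(void *b, const void *c16, size_t len);
//
//      void fill_demo(unsigned char buf[250]) {
//          static const unsigned char pat[16] =
//              { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
//          _memset_pattern16(buf, pat, 250);   // repeats pat; last copy is truncated
//      }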

        .globl __memset_pattern16
        .align 5
__memset_pattern16:
        cmplgi  cr1,r5,kShort           // check length
        lwz     r7,0(r4)                // load pattern into r7,r9,r10,r11 (these remain lwz in 64-bit mode)
        lwz     r9,4(r4)
        neg     r6,r3                   // start to compute ptr alignment
        lwz     r10,8(r4)
        lwz     r11,12(r4)
        b       __memset_pattern_common


/* ***********************************
 * * _ M E M S E T _ P A T T E R N 8 *
 * ***********************************
 *
 * Used to store an 8-byte pattern in memory:
 *
 *      void _memset_pattern8(void *b, const void *c8, size_t len);
 *
 * Where c8 points to the 8-byte pattern. None of the parameters need be aligned.
 */

        .globl __memset_pattern8
        .align 5
__memset_pattern8:
        lwz     r7,0(r4)                // load pattern (these remain lwz in 64-bit mode)
        lwz     r9,4(r4)
        cmplgi  cr1,r5,kShort           // check length
        neg     r6,r3                   // start to compute ptr alignment
        mr      r10,r7                  // replicate into 16-byte pattern
        mr      r11,r9
        b       __memset_pattern_common


/* ***********************************
 * * _ M E M S E T _ P A T T E R N 4 *
 * ***********************************
 *
 * Used to store a 4-byte pattern in memory:
 *
 *      void _memset_pattern4(void *b, const void *c4, size_t len);
 *
 * Where c4 points to the 4-byte pattern. None of the parameters need be aligned.
 */
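
// Hypothetical usage sketch (C, not part of this file): flood a pixel buffer
// with a single 32-bit color; clear_to_color and its parameters are illustrative only.
//
//      #include <stddef.h>
//      #include <stdint.h>
//      extern void _memset_pattern4(void *b, const void *c4, size_t len);
//
//      void clear_to_color(uint32_t *pixels, size_t count, uint32_t argb) {
//          _memset_pattern4(pixels, &argb, count * sizeof(uint32_t));
//      }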

        .globl __memset_pattern4
        .align 5
__memset_pattern4:
        lwz     r7,0(r4)                // load pattern
        cmplgi  cr1,r5,kShort           // check length
        neg     r6,r3                   // start to compute ptr alignment
        mr      r9,r7                   // replicate into 16-byte pattern
        mr      r10,r7
        mr      r11,r7
        b       __memset_pattern_common // don't fall through because of scatter-loading


/* ***********************************************
 * * _ M E M S E T _ P A T T E R N _ C O M M O N *
 * ***********************************************
 *
 * This is the common code used by _memset_pattern16, 8, and 4. They all get here via
 * a long branch (ie, "b") in case the routines are re-ordered, with:
 *      r3 = ptr to memory to store pattern into (unaligned)
 *      r5 = length in bytes
 *      r6 = neg(r3), used to compute #bytes to align
 *      r7, r9, r10, r11 = 16-byte pattern to store
 *      cr1 = ble if (r5 <= kShort)
 */
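
// The alignment dance below is easier to see in C. A rough sketch (illustrative
// only; store16() stands in for the four stw instructions, and len > kShort is
// guaranteed by the callers):
//
//      uint8_t *p = b;
//      size_t skew = (16 - ((uintptr_t)p & 0xF)) & 0xF;    // bytes to next 16-byte boundary
//      store16(p, pattern);            // cover the unaligned head...
//      if (skew) {
//          store16(p + 16, pattern);   // ...and leave a second copy past it
//          p += skew;                  // p is now 16-byte aligned, and p[0..15] holds
//          len -= skew;                //   the pattern rotated left by skew bytes
//      }
//      // the commpage routine then replicates the aligned 16 bytes at p using
//      // aligned 16-byte stores; the rotated pattern continues the original one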

        .globl __memset_pattern_common
        .align 5
__memset_pattern_common:
        andi.   r0,r6,0xF               // get #bytes to 16-byte align ptr
        ble--   cr1,LShort              // if short operand skip out

// Align ptr and store enough of pattern so we have an aligned
// 16-byte chunk of it (this effectively rotates the incoming pattern
// if the original ptr was not aligned).

        stw     r7,0(r3)
        stw     r9,4(r3)
        stw     r10,8(r3)
        stw     r11,12(r3)
        beq     Laligned                // skip if (r0==0), ie if r3 is 16-byte aligned
        stw     r7,16(r3)
        stw     r9,20(r3)
        stw     r10,24(r3)
        stw     r11,28(r3)
        add     r3,r3,r0                // 16-byte align ptr
        sub     r5,r5,r0                // adjust length

// We're ready to call the machine-specific commpage routine
// to do the heavy lifting. When called, _COMM_PAGE_MEMSET_PATTERN expects:
//      r4 = length (>= 32)
//      r8 = ptr (16-byte aligned)
//      r9 = ptr to 16-byte pattern (16-byte aligned)
// When it returns:
//      r3, r7, and r12 are preserved
//      r4 and r8 are updated to reflect a residual count of 0..31 bytes

Laligned:
        mflr    r12                     // save return across commpage call
        mr      r9,r3                   // point to 16-byte-aligned 16-byte pattern
        addi    r8,r3,16                // point to first unstored byte (r8 is 16-byte aligned)
        subi    r4,r5,16                // account for the aligned bytes we have stored
        bla     _COMM_PAGE_MEMSET_PATTERN
        mr.     r5,r4                   // move length (0..31) back to original reg and test for 0
        mtlr    r12
        beqlr                           // done if residual length == 0
        lwz     r7,-16(r8)              // load aligned pattern into r7,r9,r10, and r11
        lwz     r9,-12(r8)
        mr      r3,r8                   // move destination ptr back
        lwz     r10,-8(r8)
        lwz     r11,-4(r8)

// Handle short operands and leftovers.
//      r3 = dest
//      r5 = length
//      r7,r9,r10,r11 = pattern
LShort:
        srgi.   r0,r5,4                 // at least 16 bytes?
        mtcrf   0x01,r5                 // move leftover count to cr7
        beq     Lleftovers              // fewer than 16 bytes
        mtctr   r0
LShortLoop:
        stw     r7,0(r3)                // replicate the pattern
        stw     r9,4(r3)
        stw     r10,8(r3)
        stw     r11,12(r3)
        addi    r3,r3,16
        bdnz    LShortLoop              // store 16 more bytes

// Fewer than 16 bytes remaining.
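// As in the memset tail above, cr7 holds the low 4 bits of the remaining length.
// The extra twist here is that after each partial store the pattern words are
// shifted down (r10/r11 into r7/r9, then r9 into r7) so that r7 always holds the
// next bytes of the pattern that belong in memory.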
Lleftovers:
        bf      28,1f
        stw     r7,0(r3)                // store next 8 bytes
        stw     r9,4(r3)
        addi    r3,r3,8
        mr      r7,r10                  // shift pattern over
        mr      r9,r11
1:
        bf      29,2f
        stw     r7,0(r3)
        addi    r3,r3,4
        mr      r7,r9
2:
        bf      30,3f
        rlwinm  r7,r7,16,0,31           // position leftmost 2 bytes for store
        sth     r7,0(r3)
        addi    r3,r3,2
3:
        bflr    31
        srwi    r7,r7,24                // position leftmost byte for store
        stb     r7,0(r3)
        blr