/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>

/* We use mode-independent "g" opcodes such as "srgi". These expand
 * into word operations when targeting __ppc__, and into doubleword
 * operations when targeting __ppc64__.
 */
#include <architecture/ppc/mode_independent_asm.h>
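
/* For example (per the expansions in mode_independent_asm.h): "srgi. r0,r4,4"
 * assembles as "srwi. r0,r4,4" when targeting __ppc__ and as "srdi. r0,r4,4"
 * when targeting __ppc64__; "cmplgi" likewise expands to "cmplwi" or "cmpldi".
 */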
31 | ||
32 | ||
33 | .text | |
34 | #define kShort 128 // threshold for calling commpage | |
35 | ||
36 | ||
37 | /* *************** | |
38 | * * M E M S E T * | |
39 | * *************** | |
40 | * | |
41 | * Registers we use: | |
42 | * r3 = original ptr, not changed since memset returns it | |
43 | * r4 = count of bytes to set | |
44 | * r7 = value to set | |
45 | * r8 = working operand ptr | |
46 | */ | |
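
/* A minimal C sketch (illustrative only, not part of the original source) of the
 * byte replication performed by the andi./rlwimi sequence in _memset below;
 * replicate_byte() is a hypothetical helper name:
 *
 *      #include <stdint.h>
 *
 *      static uint32_t replicate_byte(uint32_t c) {
 *          uint32_t v = c & 0xFF;      // andi.  r7,r4,0xFF
 *          v |= v << 8;                // rlwimi r7,r7,8,16,23  (value in low 2 bytes)
 *          v |= v << 16;               // rlwimi r7,r7,16,0,15  (value in all 4 bytes)
 *          return v;
 *      }
 */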
47 | ||
48 | .globl _memset | |
49 | .align 5 | |
50 | _memset: // void * memset(void *b, int c, size_t len); | |
51 | andi. r7,r4,0xFF // copy value to working register, test for 0 | |
224c7076 | 52 | mr r4,r5 // move length to working register |
3d9156a7 A |
53 | cmplgi cr1,r5,kShort // long enough to bother with _COMM_PAGE_MEMSET_PATTERN? |
54 | beqa++ _COMM_PAGE_BZERO // if (c==0), map to bzero() | |
55 | rlwimi r7,r7,8,16,23 // replicate nonzero value to low 2 bytes | |
224c7076 A |
56 | neg r5,r3 // start to compute #bytes to align |
57 | mr r8,r3 // make working copy of operand ptr | |
3d9156a7 | 58 | rlwimi r7,r7,16,0,15 // value now in all 4 bytes |
224c7076 | 59 | blt cr1,Lmemset3 // too short to use commpage |
3d9156a7 A |
60 | andi. r0,r5,0xF // r0 <- #bytes to align on quadword |
61 | ||
62 | // Align ptr and store enough so that we have an aligned 16-byte pattern. | |
63 | ||
64 | stw r7,0(r8) | |
65 | stw r7,4(r8) | |
66 | stw r7,8(r8) | |
67 | stw r7,12(r8) | |
3d9156a7 A |
68 | beq Lmemset1 // skip if (r0==0), ie if r8 is 16-byte aligned |
69 | add r8,r8,r0 // 16-byte align ptr | |
70 | sub r4,r4,r0 // adjust length | |
71 | stw r7,0(r8) // now we can store an aligned 16-byte pattern | |
72 | stw r7,4(r8) | |
73 | stw r7,8(r8) | |
74 | stw r7,12(r8) | |
75 | ||
76 | // Call machine-specific commpage routine, which expects: | |
77 | // r4 = count (>=32) | |
78 | // r8 = ptr (16-byte aligned) to memory to store | |
79 | // r9 = ptr (16-byte aligned) to 16-byte pattern to store | |
80 | // When it returns: | |
81 | // r3, r7, and r12 are preserved | |
82 | // r4 and r8 are updated to reflect a residual count of from 0..31 bytes | |
83 | ||
84 | Lmemset1: | |
85 | mflr r12 // save return address | |
86 | mr r9,r8 // point to 16-byte-aligned 16-byte pattern | |
87 | addi r8,r8,16 // point to first unstored byte | |
88 | subi r4,r4,16 // account for the aligned bytes we have stored | |
224c7076 | 89 | bla _COMM_PAGE_MEMSET_PATTERN |
3d9156a7 A |
90 | mtlr r12 |
91 | ||
92 | // Here for short nonzero memset. | |
93 | // r4 = count (<= kShort bytes) | |
94 | // r7 = pattern in all four bytes | |
95 | // r8 = ptr | |
96 | Lmemset3: | |
97 | srgi. r0,r4,4 // any 16-byte chunks? | |
98 | mtcrf 0x01,r4 // move length remaining to cr7 so we can test bits | |
99 | beq Lmemset5 // fewer than 16 bytes | |
100 | mtctr r0 | |
101 | b Lmemset4 // enter loop | |
102 | ||
103 | .align 5 | |
104 | Lmemset4: // loop over 16-byte chunks | |
105 | stw r7,0(r8) | |
106 | stw r7,4(r8) | |
107 | stw r7,8(r8) | |
108 | stw r7,12(r8) | |
109 | addi r8,r8,16 | |
110 | bdnz++ Lmemset4 | |
111 | ||
112 | // Handle last 0..15 bytes. | |
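// (The "mtcrf 0x01,r4" above put the low 4 bits of the count into cr7, so bits
// 28/29/30/31 indicate whether 8/4/2/1 bytes remain; each "bf" below skips the
// corresponding store when its bit is clear.)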
Lmemset5:
        bf      28,2f
        stw     r7,0(r8)
        stw     r7,4(r8)
        addi    r8,r8,8
2:
        bf      29,3f
        stw     r7,0(r8)
        addi    r8,r8,4
3:
        bf      30,4f
        sth     r7,0(r8)
        addi    r8,r8,2
4:
        bflr    31
        stb     r7,0(r8)
        blr


/* ***********************************
 * * M E M S E T _ P A T T E R N 1 6 *
 * ***********************************
 *
 * Used to store a 16-byte pattern in memory:
 *
 *      void memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Where c16 points to the 16-byte pattern. None of the parameters need be aligned.
 */
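
/* A minimal usage sketch in C (illustrative only; assumes the Darwin <string.h>
 * declaration of memset_pattern16, and fill_with_pattern() is just a hypothetical
 * wrapper name):
 *
 *      #include <string.h>
 *      #include <stdint.h>
 *      #include <stddef.h>
 *
 *      void fill_with_pattern(uint8_t *buf, size_t len) {
 *          static const uint8_t pat[16] = { 0xDE,0xAD,0xBE,0xEF, 0xDE,0xAD,0xBE,0xEF,
 *                                           0xDE,0xAD,0xBE,0xEF, 0xDE,0xAD,0xBE,0xEF };
 *          memset_pattern16(buf, pat, len);    // neither buf, pat, nor len need be aligned
 *      }
 */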
142 | ||
224c7076 | 143 | .globl _memset_pattern16 |
3d9156a7 | 144 | .align 5 |
224c7076 | 145 | _memset_pattern16: |
3d9156a7 A |
146 | cmplgi cr1,r5,kShort // check length |
147 | lwz r7,0(r4) // load pattern into (these remain lwz in 64-bit mode) | |
148 | lwz r9,4(r4) | |
149 | neg r6,r3 // start to compute ptr alignment | |
150 | lwz r10,8(r4) | |
151 | lwz r11,12(r4) | |
152 | b __memset_pattern_common | |
153 | ||
154 | ||
224c7076 A |
155 | /* ********************************* |
156 | * * M E M S E T _ P A T T E R N 8 * | |
157 | * ********************************* | |
3d9156a7 A |
158 | * |
159 | * Used to store an 8-byte pattern in memory: | |
160 | * | |
224c7076 | 161 | * void memset_pattern8(void *b, const void *c8, size_t len); |
3d9156a7 A |
162 | * |
163 | * Where c8 points to the 8-byte pattern. None of the parameters need be aligned. | |
164 | */ | |
165 | ||
224c7076 | 166 | .globl _memset_pattern8 |
3d9156a7 | 167 | .align 5 |
224c7076 | 168 | _memset_pattern8: |
3d9156a7 A |
169 | lwz r7,0(r4) // load pattern (these remain lwz in 64-bit mode) |
170 | lwz r9,4(r4) | |
171 | cmplgi cr1,r5,kShort // check length | |
172 | neg r6,r3 // start to compute ptr alignment | |
173 | mr r10,r7 // replicate into 16-byte pattern | |
174 | mr r11,r9 | |
175 | b __memset_pattern_common | |
176 | ||
177 | ||
224c7076 A |
178 | /* ********************************* |
179 | * * M E M S E T _ P A T T E R N 4 * | |
180 | * ********************************* | |
3d9156a7 A |
181 | * |
182 | * Used to store a 4-byte pattern in memory: | |
183 | * | |
224c7076 | 184 | * void memset_pattern4(void *b, const void *c4, size_t len); |
3d9156a7 A |
185 | * |
186 | * Where c4 points to the 4-byte pattern. None of the parameters need be aligned. | |
187 | */ | |
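
/* A minimal C sketch of the common memset_pattern4 use case, filling a buffer with
 * a repeated 32-bit value (illustrative only; assumes the Darwin <string.h>
 * declaration, and fill_words() is just a hypothetical wrapper name):
 *
 *      #include <string.h>
 *      #include <stdint.h>
 *      #include <stddef.h>
 *
 *      void fill_words(uint32_t *buf, size_t count, uint32_t value) {
 *          memset_pattern4(buf, &value, count * sizeof(uint32_t));
 *      }
 */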
188 | ||
224c7076 | 189 | .globl _memset_pattern4 |
3d9156a7 | 190 | .align 5 |
224c7076 | 191 | _memset_pattern4: |
3d9156a7 A |
192 | lwz r7,0(r4) // load pattern |
193 | cmplgi cr1,r5,kShort // check length | |
194 | neg r6,r3 // start to compute ptr alignment | |
195 | mr r9,r7 // replicate into 16-byte pattern | |
196 | mr r10,r7 | |
197 | mr r11,r7 | |
198 | b __memset_pattern_common // don't fall through because of scatter-loading | |
199 | ||
200 | ||
201 | /* *********************************************** | |
202 | * * _ M E M S E T _ P A T T E R N _ C O M M O N * | |
203 | * *********************************************** | |
204 | * | |
224c7076 | 205 | * This is the common code used by _memset_pattern16, 8, and 4. They all get here via |
3d9156a7 A |
206 | * long branch (ie, "b") in case the routines are re-ordered, with: |
207 | * r3 = ptr to memory to store pattern into (unaligned) | |
208 | * r5 = length in bytes | |
209 | * r6 = neg(r3), used to compute #bytes to align | |
210 | * r7, r9, r10, r11 = 16-byte pattern to store | |
211 | * cr1= ble if (r5 <= kShort) | |
212 | */ | |
213 | ||
214 | .globl __memset_pattern_common | |
224c7076 | 215 | .private_extern __memset_pattern_common // avoid dyld stub, which trashes r11 |
3d9156a7 A |
216 | .align 5 |
217 | __memset_pattern_common: | |
218 | andi. r0,r6,0xF // get #bytes to 16-byte align ptr | |
219 | ble-- cr1,LShort // if short operand skip out | |
220 | ||
221 | // Align ptr and store enough of pattern so we have an aligned | |
222 | // 16-byte chunk of it (this effectively rotates incoming pattern | |
223 | // if the original ptr was not aligned.) | |
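// (For example: with an incoming pattern "ABCDEFGHIJKLMNOP" and a ptr 4 bytes
// past a 16-byte boundary, r0 = 12 and the aligned 16-byte chunk at ptr+12
// holds "MNOPABCDEFGHIJKL"; the bulk stores continue with this rotated pattern,
// matching the bytes already laid down.)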
224 | ||
225 | stw r7,0(r3) | |
226 | stw r9,4(r3) | |
227 | stw r10,8(r3) | |
228 | stw r11,12(r3) | |
229 | beq Laligned // skip if (r0==0), ie if r3 is 16-byte aligned | |
230 | stw r7,16(r3) | |
231 | stw r9,20(r3) | |
232 | stw r10,24(r3) | |
233 | stw r11,28(r3) | |
234 | add r3,r3,r0 // 16-byte align ptr | |
235 | sub r5,r5,r0 // adjust length | |
236 | ||
237 | // We're ready to call the machine-specific commpage routine | |
238 | // to do the heavy lifting. When called, _COMM_PAGE_MEMSET_PATTERN expects: | |
239 | // r4 = length (>= 32) | |
240 | // r8 = ptr (16-byte aligned) | |
241 | // r9 = ptr to 16-byte pattern (16-byte aligned) | |
242 | // When it returns: | |
243 | // r3, r7, and r12 are preserved | |
244 | // r4 and r8 are updated to reflect a residual count of from 0..31 bytes | |
245 | ||
246 | Laligned: | |
247 | mflr r12 // save return across commpage call | |
248 | mr r9,r3 // point to 16-byte aligned 16-byte pattern | |
249 | addi r8,r3,16 // point to first unstored byte (r8 is 16-byte aligned) | |
250 | subi r4,r5,16 // account for the aligned bytes we have stored | |
251 | bla _COMM_PAGE_MEMSET_PATTERN | |
252 | mr. r5,r4 // move length (0..31) back to original reg and test for 0 | |
253 | mtlr r12 | |
254 | beqlr // done if residual length == 0 | |
255 | lwz r7,-16(r8) // load aligned pattern into r7,r9,r10, and r11 | |
256 | lwz r9,-12(r8) | |
257 | mr r3,r8 // move destination ptr back | |
258 | lwz r10,-8(r8) | |
259 | lwz r11,-4(r8) | |
260 | ||
261 | // Handle short operands and leftovers. | |
262 | // r3 = dest | |
263 | // r5 = length | |
264 | // r7,r9,r10,r11 = pattern | |
265 | LShort: | |
266 | srgi. r0,r5,4 // at least 16 bytes? | |
267 | mtcrf 0x01,r5 // move leftover count to cr7 | |
268 | beq Lleftovers | |
269 | mtctr r0 | |
270 | LShortLoop: | |
271 | stw r7,0(r3) // replicate the pattern | |
272 | stw r9,4(r3) | |
273 | stw r10,8(r3) | |
274 | stw r11,12(r3) | |
275 | addi r3,r3,16 | |
276 | bdnz LShortLoop // store 16 more bytes | |
277 | ||
278 | // Fewer than 16 bytes remaining. | |
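// (Note how the pattern stays in phase: after each partial store the remaining
// pattern words are shifted toward r7, so r7 always holds the next bytes to be
// stored.)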
Lleftovers:
        bf      28,1f
        stw     r7,0(r3)                // store next 8 bytes
        stw     r9,4(r3)
        addi    r3,r3,8
        mr      r7,r10                  // shift pattern over
        mr      r9,r11
1:
        bf      29,2f
        stw     r7,0(r3)
        addi    r3,r3,4
        mr      r7,r9
2:
        bf      30,3f
        rlwinm  r7,r7,16,0,31           // position leftmost 2 bytes for store
        sth     r7,0(r3)
        addi    r3,r3,2
3:
        bflr    31
        srwi    r7,r7,24                // position leftmost byte for store
        stb     r7,0(r3)
        blr