/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <arm/arch.h>

// Only built for armv6 and higher.
#if defined _ARM_ARCH_6

// If we're building for armv7, and not for DYLD, then we have a symbol
// resolver, so we need to rename these implementations.
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
#define _memset_pattern4 _memset_pattern4$VARIANT$Generic
#define _memset_pattern8 _memset_pattern8$VARIANT$Generic
#define _memset_pattern16 _memset_pattern16$VARIANT$Generic
#endif

#include <mach/machine/asm.h>

/*
 * This file contains the following functions:
 *
 *      void memset_pattern4(void *b, const void *c4, size_t len)
 *      void memset_pattern8(void *b, const void *c8, size_t len)
 *      void memset_pattern16(void *b, const void *c16, size_t len)
 *
 * memset() itself is implemented in bzero.s.
 *
 * This is a reasonably well optimized version of the memset_pattern*
 * routines, implemented for the ARM9 and ARM11 processors using the ARMv6
 * instruction set. These routines use only the ARM core registers.
 *
 * The algorithm aligns the destination pointer on a 16-byte boundary and
 * then blasts data 64 bytes at a time, in two 32-byte stores per loop
 * iteration.
 */
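
/*
 * For reference, here is a minimal C sketch of the contract these routines
 * implement (shown for the 4-byte variant; the 8- and 16-byte variants are
 * analogous). This is only an illustration of the semantics, not the code
 * below, and the *_ref helper name is purely illustrative: the pattern is
 * tiled from the start of the buffer, and the final copy may be partial
 * when 'len' is not a multiple of the pattern size.
 *
 *      #include <string.h>
 *
 *      void memset_pattern4_ref(void *b, const void *c4, size_t len)
 *      {
 *          unsigned char *dst = b;
 *          while (len >= 4) {          // whole copies of the pattern
 *              memcpy(dst, c4, 4);
 *              dst += 4;
 *              len -= 4;
 *          }
 *          memcpy(dst, c4, len);       // trailing partial pattern
 *      }
 */
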
        .text
        .align 2
        .syntax unified

/*----------------------------------------------------------------------------*/
/* void memset_pattern4(void *ptr, const void *pattern4, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 4-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
        .globl _memset_pattern4
_memset_pattern4:
        cmp     r2, #0                  /* check if len is zero */
        bxeq    lr                      /* return if length is zero */

        /* We need some registers, so save volatiles on the stack. */
        /* Avoid r7 (frame pointer) and r9 (thread register). */
        stmfd   sp!, {r4-r7, lr}
        add     r7, sp, #12             /* establish frame */
        stmfd   sp!, {r8, r10-r11}

        /* Copy the destination base pointer r0 to r12 and leave r0 alone, */
        /* so that we return the original pointer back to the caller. */
        mov     r12, r0

        /* Check whether 'len' is long enough to bother aligning the */
        /* destination pointer. */
        cmp     r2, #32                 /* long enough to bother aligning? */
        movlt   r3, #4                  /* move pattern length into r3 */
        movlt   r10, #4                 /* pattern index */
        movlt   r11, r1                 /* move pattern pointer into r11 */
        blt     L_Short                 /* no: store bytewise */

        /* Move 'len' into r1, get the 4-byte pattern in r2. */
        mov     r6, r2                  /* temporarily move 'len' into r6 */
        ldr     r2, [r1]                /* load 4-byte pattern into r2 */
        mov     r1, r6                  /* move 'len' from r6 to r1 */

        mov     r3, r2                  /* copy 4-byte pattern into r3, r4 and r5 */
        mov     r4, r2
        mov     r5, r2

L_NotShort:
        /* Check for a 16- or 32-byte aligned destination pointer. */
        tst     r12, #0x1F              /* check for 32-byte alignment */
        beq     L_Aligned
        tst     r12, #0xF               /* check for 16-byte alignment */
        beq     L_16ByteAligned
        b       L_Unaligned             /* not 16-byte aligned */

L_Bytewise:
        ldrb    r4, [r11], #1           /* load next pattern byte */
        strb    r4, [r12], #1           /* store it */
        subs    r10, #1                 /* one fewer pattern byte left */
        moveq   r10, r3                 /* pattern exhausted: reset index */
        moveq   r11, r1                 /* ... and rewind pattern pointer */
        sub     r2, #1

L_Short:
        cmp     r2, #0                  /* more bytes left? */
        bne     L_Bytewise
        ldm     sp!, {r8, r10-r11}      /* restore registers from stack */
        ldm     sp!, {r4-r7, pc}        /* restore & return from subroutine */

/* 'len' is long enough to justify aligning the destination pointer. */
/* */
/* By the time we reach here, data is stored in registers as follows: */
/* r1 << 'len' (length of destination buffer in bytes) */
/* r2-r5 << pattern; either 4x 4-byte OR 2x 8-byte OR 1x 16-byte */
/* r12 << destination pointer copy (scratch register) */
/* r0 << destination pointer original */
/* */
/* Use r11 as a scratch register to hold the #bytes needed to 16-byte align. */
/* */
/* Unaligned on a 32-byte boundary: store 1-15 bytes until 16-byte aligned. */
/* As we store these bytes, we rotate the pattern held in r2-r5 to reflect */
/* the new alignment. */

L_Unaligned:
        mov     r11, r12, lsl #28       /* bits[31:28] = low 4 bits of dest */
        rsb     r11, r11, #0            /* negate: bits[31:28] = #bytes to align */
        msr     cpsr_f, r11             /* bits[31:28] of cpsr (NZCV) now contain #bytes to align */
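
/*
 * The trick above encodes the 4-bit head count (16 - (dest & 0xF)) directly
 * in the NZCV flags: N = the 8s bit, Z = the 4s bit, C = the 2s bit and
 * V = the 1s bit. Each conditional store below then handles one power of
 * two. Worked example (an illustration, not from the original source): if
 * dest & 0xF == 5, the head count is 11 = 0b1011, so N, C and V are set,
 * and the code stores 8 + 2 + 1 = 11 bytes, leaving r12 16-byte aligned.
 */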

L_Store15BytesAndRotatePattern:
        strbvs  r2, [r12], #1           /* V set: unaligned in the 1s column */
        andvs   r6, r2, #0xFF           /* rotate pattern right in r2-r5 by 1 byte */
        andvs   r8, r3, #0xFF           /* treat r2-r5 as one contiguous */
        andvs   r10, r4, #0xFF          /* 16-byte register with r2 holding the LSB */
        andvs   r11, r5, #0xFF          /* and r5 holding the MSB */
        lsrvs   r2, r2, #8
        lsrvs   r3, r3, #8
        lsrvs   r4, r4, #8
        lsrvs   r5, r5, #8
        orrvs   r2, r2, r8, lsl #24
        orrvs   r3, r3, r10, lsl #24
        orrvs   r4, r4, r11, lsl #24
        orrvs   r5, r5, r6, lsl #24

        strhcs  r2, [r12], #2           /* C set: unaligned in the 2s column */
        movcs   r6, r2, lsl #16         /* rotate pattern right in r2-r5 by 2 bytes */
        movcs   r8, r3, lsl #16
        movcs   r10, r4, lsl #16
        movcs   r11, r5, lsl #16
        lsrcs   r2, r2, #16
        lsrcs   r3, r3, #16
        lsrcs   r4, r4, #16
        lsrcs   r5, r5, #16
        orrcs   r2, r2, r8
        orrcs   r3, r3, r10
        orrcs   r4, r4, r11
        orrcs   r5, r5, r6

        streq   r2, [r12], #4           /* Z set: unaligned in the 4s column */
        moveq   r6, r2                  /* rotate pattern right in r2-r5 by 4 bytes */
        moveq   r2, r3
        moveq   r3, r4
        moveq   r4, r5
        moveq   r5, r6

        stmmi   r12!, {r2-r3}           /* N set: unaligned in the 8s column */
        movmi   r6, r2                  /* rotate pattern right in r2-r5 by 8 bytes */
        movmi   r8, r3
        movmi   r2, r4
        movmi   r3, r5
        movmi   r4, r6
        movmi   r5, r8

        mrs     r11, cpsr               /* copy cpsr into r11 */
        subs    r1, r1, r11, lsr #28    /* subtract the #bytes just stored from 'len' */
        ldmeq   sp!, {r8, r10-r11}      /* if done, restore registers from stack */
        ldmeq   sp!, {r4-r7, pc}        /* restore & return from subroutine */

/* By the time we reach here, we are 16-byte aligned and r2-r5 contain the */
/* rotated pattern. Now let's make sure we are 32-byte aligned. */
L_16ByteAligned:
        tst     r12, #(1 << 4)
        stmne   r12!, {r2-r5}
        subsne  r1, r1, #16

/* By the time we reach here, data is stored in registers as follows: */
/* r1 << 'len' (remaining length of destination buffer in bytes) */
/* r2-r5 << rotated pattern; either 4x 4-byte OR 2x 8-byte OR 1x 16-byte */
/* r12 << aligned destination pointer copy (scratch register) */
L_Aligned:
        cmp     r1, #64
        blt     L_AlignedLessThan64

        /* Copy the pattern into four more registers for 64-byte transfers. */
        mov     r6, r2
        mov     r8, r3
        mov     r10, r4
        mov     r11, r5

        /* At this point we are 32-byte aligned and 'len' is at least 64 bytes. */
        /* Transfer 64 bytes at a time until 'len' drops below 64 bytes. */
        sub     r1, r1, #64             /* pre-subtract to avoid an extra compare in the loop */
L_Loop64:
        stm     r12!, {r2-r6, r8, r10-r11}
        subs    r1, r1, #64
        stm     r12!, {r2-r6, r8, r10-r11}
        bge     L_Loop64
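
/*
 * Loop accounting note (illustrative, not from the original source): with
 * the pre-subtract, r1 holds 'len - 64' on entry. Each iteration stores
 * 64 bytes in two 32-byte stm instructions and subtracts 64; the loop
 * repeats while the result is still >= 0, i.e. while at least 64 more
 * bytes remain. For len = 128: r1 = 64 on entry, the first pass leaves
 * r1 = 0 (branch taken), the second leaves r1 = -64 (fall through), and
 * the adds below restores r1 to 0, so the routine returns.
 */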

        /* Return if 'len' is now zero. */
        adds    r1, r1, #64             /* readjust length; we pre-subtracted an extra 64 */
        ldmeq   sp!, {r8, r10-r11}      /* restore registers from stack */
        ldmeq   sp!, {r4-r7, pc}        /* restore & return from subroutine */

L_AlignedLessThan64:
        /* Do we have 16 or more bytes left? */
        cmp     r1, #16
        stmge   r12!, {r2-r5}
        subsge  r1, r1, #16
        bgt     L_AlignedLessThan64
        ldmeq   sp!, {r8, r10-r11}      /* restore registers from stack */
        ldmeq   sp!, {r4-r7, pc}        /* restore & return from subroutine */

L_AlignedLessThan16:
        /* Store the last up-to-15 bytes. */
        /* Move the remaining 'len' bits [3:0] into the flags field of the cpsr. */
        mov     r1, r1, lsl #28
        msr     cpsr_f, r1
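
/*
 * This is the same flag trick as the head alignment above, but using the
 * remaining count itself (0-15) rather than its negation: N/Z/C/V select
 * the 8-, 4-, 2- and 1-byte tail stores below. For example (illustration
 * only), a remaining count of 6 = 0b0110 sets Z and C, so 4 + 2 bytes are
 * stored.
 */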

        stmmi   r12!, {r2-r3}           /* N set: store 8 bytes */
        movmi   r2, r4                  /* shift pattern vector down 8 bytes */
        movmi   r3, r5

        streq   r2, [r12], #4           /* Z set: store 4 bytes */
        moveq   r2, r3                  /* shift pattern vector down 4 bytes */

        strhcs  r2, [r12], #2           /* C set: store 2 bytes */
        lsrcs   r2, #16                 /* shift register right 2 bytes */

        strbvs  r2, [r12], #1           /* V set: store 1 byte */
        ldm     sp!, {r8, r10-r11}      /* restore registers from stack */
        ldm     sp!, {r4-r7, pc}        /* restore & return from subroutine */

/*----------------------------------------------------------------------------*/
/* void memset_pattern8(void *ptr, const void *pattern8, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 8-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
        .globl _memset_pattern8
_memset_pattern8:
        cmp     r2, #0                  /* check if len is zero */
        bxeq    lr                      /* return if length is zero */

        /* We need some registers, so save volatiles on the stack. */
        /* Avoid r7 (frame pointer) and r9 (thread register). */
        stmfd   sp!, {r4-r7, lr}
        add     r7, sp, #12             /* establish frame */
        stmfd   sp!, {r8, r10-r11}

        /* Copy the destination base pointer r0 to r12 and leave r0 alone, */
        /* so that we return the original pointer back to the caller. */
        mov     r12, r0

        /* Check whether 'len' is long enough to bother aligning the */
        /* destination pointer. */
        cmp     r2, #32                 /* long enough to bother aligning? */
        movlt   r3, #8                  /* move pattern length into r3 */
        movlt   r10, #8                 /* pattern index */
        movlt   r11, r1                 /* move pattern pointer into r11 */
        blt     L_Short                 /* no: store bytewise */

        /* Move 'len' into r1, get the 8-byte pattern in r2-r3. */
        mov     r6, r2                  /* temporarily move 'len' into r6 */
        ldr     r2, [r1], #4            /* load 8-byte pattern into r2-r3 */
        ldr     r3, [r1], #4
        mov     r1, r6                  /* move 'len' from r6 to r1 */

        mov     r4, r2                  /* replicate the 8-byte pattern into r4-r5 */
        mov     r5, r3
        b       L_NotShort              /* join the common path */
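
/*
 * Design note: all three entry points funnel into the shared L_NotShort
 * path by expanding their pattern to fill r2-r5 as one 16-byte pattern
 * (4x the 4-byte pattern, 2x the 8-byte pattern, or the 16-byte pattern
 * itself). The alignment, rotation and store code can then be identical
 * for every variant.
 */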

/*----------------------------------------------------------------------------*/
/* void memset_pattern16(void *ptr, const void *pattern16, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 16-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
        .globl _memset_pattern16
_memset_pattern16:
        cmp     r2, #0                  /* check if len is zero */
        bxeq    lr                      /* return if length is zero */

        /* We need some registers, so save volatiles on the stack. */
        /* Avoid r7 (frame pointer) and r9 (thread register). */
        stmfd   sp!, {r4-r7, lr}
        add     r7, sp, #12             /* establish frame */
        stmfd   sp!, {r8, r10-r11}

        /* Copy the destination base pointer r0 to r12 and leave r0 alone, */
        /* so that we return the original pointer back to the caller. */
        mov     r12, r0

        /* Check whether 'len' is long enough to bother aligning the */
        /* destination pointer. */
        cmp     r2, #32                 /* long enough to bother aligning? */
        movlt   r3, #16                 /* move pattern length into r3 */
        movlt   r10, #16                /* pattern index */
        movlt   r11, r1                 /* move pattern pointer into r11 */
        blt     L_Short                 /* no: store bytewise */

        /* Move 'len' into r1, get the 16-byte pattern in r2-r5. */
        mov     r6, r2                  /* temporarily move 'len' into r6 */
        ldr     r2, [r1], #4            /* load 16-byte pattern into r2-r5 */
        ldr     r3, [r1], #4
        ldr     r4, [r1], #4
        ldr     r5, [r1], #4
        mov     r1, r6                  /* move 'len' from r6 to r1 */

        b       L_NotShort              /* join the common path */

#endif /* _ARM_ARCH_6 */