/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <arm/arch.h>

#if defined(_ARM_ARCH_6)

#include <mach/machine/asm.h>

/*
 * This file contains the following functions:
 *
 * void memset_pattern4(void *b, const void *c4, size_t len)
 * void memset_pattern8(void *b, const void *c8, size_t len)
 * void memset_pattern16(void *b, const void *c16, size_t len)
 *
 * memset() itself is implemented in the bzero.s file.
 *
 * This is a reasonably well-optimized version of the memset_pattern* routines,
 * implemented for the ARM9 and ARM11 processors using the ARMv6 instruction
 * set. These routines use the ARM core registers.
 *
 * The algorithm is to align the destination pointer on a 16-byte boundary
 * and then blast data 64 bytes at a time, in two stores of 32 bytes per loop.
 *
 */
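
/*
 * For reference only: a rough C sketch of what these routines implement.
 * This is an illustrative sketch, not the code that is built here, and the
 * function name memset_pattern4_ref below is hypothetical.
 *
 *      #include <stddef.h>
 *
 *      void memset_pattern4_ref(void *b, const void *c4, size_t len)
 *      {
 *          unsigned char *dst = b;
 *          const unsigned char *pat = c4;
 *          // repeat the 4-byte pattern across the buffer; 'len' need not be
 *          // a multiple of 4, so the final copy may be truncated
 *          for (size_t i = 0; i < len; i++)
 *              dst[i] = pat[i % 4];
 *      }
 *
 * memset_pattern8 and memset_pattern16 behave the same way with 8- and
 * 16-byte patterns. Typical use is filling a buffer with a repeated word,
 * e.g. memset_pattern4(buf, &color, buf_size).
 */
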
        .text
        .align 2
        .syntax unified

/*----------------------------------------------------------------------------*/
/* void memset_pattern4(void *ptr, const void *pattern4, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 4-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
        .globl _memset_pattern4
_memset_pattern4:
        cmp r2, #0 /* check if len is zero */
        bxeq lr /* return if length is zero */

        /* We need some registers, so save volatiles on stack */
        /* Avoid r7 (frame pointer) and r9 (thread register) */
        stmfd sp!, {r4-r7, lr}
        add r7, sp, #12 /* establish frame */
        stmfd sp!, {r8, r10-r11}

        /* copy destination base pointer r0 to r12 and leave r0 alone */
        /* so that we return original pointer back to the caller */
        mov r12, r0

        /* Check if 'len' is long enough to bother alignment of destination */
        /* pointer */
        cmp r2, #32 /* long enough to bother aligning? */
        movlt r3, #4 /* move pattern length into r3 */
        movlt r10, #4 /* pattern index */
        movlt r11, r1 /* move pattern pointer into r11 */
        blt L_Short /* no */

        /* move 'len' into r1, get 4-byte pattern in r2 */
        mov r6, r2 /* temporarily move 'len' in to r6 */
        ldr r2, [r1] /* load 4-byte pattern into r2 */
        mov r1, r6 /* move 'len' from r6 to r1 */

        mov r3, r2 /* copy 4-byte pattern into r3, r4 and r5 registers */
        mov r4, r2
        mov r5, r2

L_NotShort:

        /* Check for 16 or 32 byte aligned destination pointer */
        tst r12, #0x1F /* check for 32 byte aligned */
        beq L_Aligned
        tst r12, #0xF /* check for 16 byte aligned */
        beq L_16ByteAligned
        b L_Unaligned /* yes */

L_Bytewise:
        ldrb r4, [r11], #1 /* load next pattern byte */
        strb r4, [r12], #1 /* store it to the destination */
        subs r10, #1 /* decrement pattern index */
        moveq r10, r3 /* pattern exhausted: reset index to pattern length */
        moveq r11, r1 /* and rewind pattern pointer to its start */
        sub r2, #1 /* one byte of 'len' done */

L_Short:
        cmp r2, #0 /* more bytes left? */
        bne L_Bytewise
        ldm sp!, {r8, r10-r11} /* restores registers from stack */
        ldm sp!, {r4-r7, pc} /* restore & return from subroutine */

/* 'len' is long enough to justify aligning the destination pointer */
/* */
/* By the time we reach here, data is stored in registers as follows: */
/* r1 << 'len' (length of destination buffer in bytes) */
/* r2-r5 << pattern; either 4x4byte OR 2x8byte OR 1x16-byte */
/* r12 << destination pointer copy (scratch register) */
/* r0 << destination pointer original */
/* */
/* Use r11 as scratch register to store the #bytes offset to 16-byte align */
/* */
/* Unaligned on 32-byte boundary, store 1-15 bytes until 16-byte aligned */
/* As we store these bytes, we rotate the pattern stored in r2-r5 to reflect */
/* the alignment. */

L_Unaligned:
        mov r11, r12, lsl #28
        rsb r11, r11, #0
        msr cpsr_f, r11 /* Bits[31:28] of cpsr now contain #bytes to align */
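        /* The NZCV flags now encode the byte count needed to reach a 16-byte */
        /* boundary: N = 8 bytes, Z = 4, C = 2 and V = 1, so the conditional */
        /* stores below write out the leading 1-15 unaligned bytes. */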

L_Store15BytesAndRotatePattern:
        strbvs r2, [r12], #1 /* v is set, unaligned in the 1s column */
        andvs r6, r2, #0xFF /* Rotate pattern right in r2-r5 by 1-byte */
        andvs r8, r3, #0xFF /* Consider registers r2-r5 as a contiguous */
        andvs r10, r4, #0xFF /* 16-byte register with r2 containing LSB */
        andvs r11, r5, #0xFF /* and r5 containing MSB */
        lsrvs r2, r2, #8
        lsrvs r3, r3, #8
        lsrvs r4, r4, #8
        lsrvs r5, r5, #8
        orrvs r2, r2, r8, lsl #24
        orrvs r3, r3, r10, lsl #24
        orrvs r4, r4, r11, lsl #24
        orrvs r5, r5, r6, lsl #24

        strhcs r2, [r12], #2 /* c is set, unaligned in the 2s column */
        movcs r6, r2, lsl #16 /* Rotate pattern right in r2-r5 by 2-bytes */
        movcs r8, r3, lsl #16
        movcs r10, r4, lsl #16
        movcs r11, r5, lsl #16
        lsrcs r2, r2, #16
        lsrcs r3, r3, #16
        lsrcs r4, r4, #16
        lsrcs r5, r5, #16
        orrcs r2, r2, r8
        orrcs r3, r3, r10
        orrcs r4, r4, r11
        orrcs r5, r5, r6

        streq r2, [r12], #4 /* z is set, unaligned in the 4s column */
        moveq r6, r2 /* Rotate pattern right in r2-r5 by 4-bytes */
        moveq r2, r3
        moveq r3, r4
        moveq r4, r5
        moveq r5, r6

        stmmi r12!, {r2-r3} /* n is set, unaligned in the 8s column */
        movmi r6, r2 /* Rotate pattern right in r2-r5 by 8-bytes */
        movmi r8, r3
        movmi r2, r4
        movmi r3, r5
        movmi r4, r6
        movmi r5, r8

        mrs r11, cpsr /* copy cpsr into r11 */
        subs r1, r1, r11, lsr #28 /* subtract the #bytes just stored from 'len' */
        ldmeq sp!, {r8, r10-r11} /* restores registers from stack */
        ldmeq sp!, {r4-r7, pc} /* restore & return from subroutine */

/* By the time we reach here, we are 16-byte aligned and r2-r5 contain the */
/* rotated pattern. Now let's make sure we are 32-byte aligned. */
L_16ByteAligned:
        tst r12, #(1 << 4)
        stmne r12!, {r2-r5}
        subsne r1, r1, #16

/* By the time we reach here, data is stored in registers as follows: */
/* r1 << 'len' (remaining length of destination buffer in bytes) */
/* r2-r5 << rotated pattern; either 4x4byte OR 2x8byte OR 1x16-byte */
/* r12 << aligned destination pointer copy (scratch register) */
L_Aligned:
        cmp r1, #64
        blt L_AlignedLessThan64

        /* Copy the pattern into four more registers so that we can do 64-byte transfers */
        mov r6, r2
        mov r8, r3
        mov r10, r4
        mov r11, r5

        /* At this point, we are 16-byte aligned and 'len' is at least 64 bytes */
        /* Let's transfer 64 bytes at a time until len becomes less than 64 bytes */
        sub r1, r1, #64 /* pre-subtract to avoid extra compare in loop */
L_Loop64:
        stm r12!, {r2-r6, r8, r10-r11}
        subs r1, r1, #64
        stm r12!, {r2-r6, r8, r10-r11}
        bge L_Loop64

        /* return if 'len' is zero */
        adds r1, r1, #64 /* readjust length; previously subtracted extra 64 */
        ldmeq sp!, {r8, r10-r11} /* restores registers from stack */
        ldmeq sp!, {r4-r7, pc} /* restore & return from subroutine */

L_AlignedLessThan64:
        /* do we have 16 or more bytes left */
        cmp r1, #16
        stmge r12!, {r2-r5}
        subsge r1, r1, #16
        bgt L_AlignedLessThan64
        ldmeq sp!, {r8, r10-r11} /* restores registers from stack */
        ldmeq sp!, {r4-r7, pc} /* restore & return from subroutine */

L_AlignedLessThan16:
        /* store the last up to 15 bytes */
        /* move the remaining len bits [3:0] to the flags area of cpsr */
        mov r1, r1, lsl #28
        msr cpsr_f, r1
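        /* As above, the flags encode the remaining length: */
        /* N = 8 bytes, Z = 4, C = 2 and V = 1 byte */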

        stmmi r12!, {r2-r3} /* n is set, store 8 bytes */
        movmi r2, r4 /* shift vector down 8 bytes */
        movmi r3, r5

        streq r2, [r12], #4 /* z is set, store 4 bytes */
        moveq r2, r3 /* shift vector down 4 bytes */

        strhcs r2, [r12], #2 /* c is set, store 2 bytes */
        lsrcs r2, #16 /* shift register right 2 bytes */

        strbvs r2, [r12], #1 /* v is set, store 1 byte */
        ldm sp!, {r8, r10-r11} /* restores registers from stack */
        ldm sp!, {r4-r7, pc} /* restore & return from subroutine */

/*----------------------------------------------------------------------------*/
/* void memset_pattern8(void *ptr, const void *pattern8, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 8-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
        .globl _memset_pattern8
_memset_pattern8:
        cmp r2, #0 /* check if len is zero */
        bxeq lr /* return if length is zero */

        /* We need some registers, so save volatiles on stack */
        /* Avoid r7 (frame pointer) and r9 (thread register) */
        stmfd sp!, {r4-r7, lr}
        add r7, sp, #12 /* establish frame */
        stmfd sp!, {r8, r10-r11}

        /* copy destination base pointer r0 to r12 and leave r0 alone */
        /* so that we return original pointer back to the caller */
        mov r12, r0

        /* Check if 'len' is long enough to bother alignment of destination */
        /* pointer */
        cmp r2, #32 /* long enough to bother aligning? */
        movlt r3, #8 /* move pattern length into r3 */
        movlt r10, #8 /* pattern index */
        movlt r11, r1 /* move pattern pointer into r11 */
        blt L_Short /* no */

        /* move 'len' into r1, get 8-byte pattern in r2-r3 */
        mov r6, r2 /* temporarily move 'len' in to r6 */
        ldr r2, [r1], #4 /* load 8-byte pattern into r2-r3 */
        ldr r3, [r1], #4
        mov r1, r6 /* move 'len' from r6 to r1 */

        mov r4, r2 /* copy 8-byte pattern into r4-r5 registers */
        mov r5, r3
        b L_NotShort /* yes */


/*----------------------------------------------------------------------------*/
/* void memset_pattern16(void *ptr, const void *pattern16, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 16-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
        .globl _memset_pattern16
_memset_pattern16:
        cmp r2, #0 /* check if len is zero */
        bxeq lr /* return if length is zero */

        /* We need some registers, so save volatiles on stack */
        /* Avoid r7 (frame pointer) and r9 (thread register) */
        stmfd sp!, {r4-r7, lr}
        add r7, sp, #12 /* establish frame */
        stmfd sp!, {r8, r10-r11}

        /* copy destination base pointer r0 to r12 and leave r0 alone */
        /* so that we return original pointer back to the caller */
        mov r12, r0

        /* Check if 'len' is long enough to bother alignment of destination */
        /* pointer */
        cmp r2, #32 /* long enough to bother aligning? */
        movlt r3, #16 /* move pattern length into r3 */
        movlt r10, #16 /* pattern index */
        movlt r11, r1 /* move pattern pointer into r11 */
        blt L_Short /* no */

        /* move 'len' into r1, get 16-byte pattern in r2-r5 */
        mov r6, r2 /* temporarily move 'len' in to r6 */
        ldr r2, [r1], #4 /* load 16-byte pattern into r2-r5 */
        ldr r3, [r1], #4
        ldr r4, [r1], #4
        ldr r5, [r1], #4
        mov r1, r6 /* move 'len' from r6 to r1 */

        b L_NotShort /* yes */


#endif /* _ARM_ARCH_6 */