/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <arm/arch.h>
#include <mach/machine/asm.h>

/*
 * This file contains the following functions:
 *
 * void memset_pattern4(void *b, const void *c4, size_t len)
 * void memset_pattern8(void *b, const void *c8, size_t len)
 * void memset_pattern16(void *b, const void *c16, size_t len)
 *
 * memset() itself is implemented in the bzero.s file.
 *
 * This is a reasonably well optimized version of the memset_pattern*
 * routines, implemented for ARM processors using the ARMv4 and later
 * instruction sets. These routines use only the ARM core registers.
 *
 * The algorithm is to align the destination pointer on a 16 byte boundary
 * and then blast out data 64 bytes at a time, in two stores of 32 bytes per
 * loop iteration.
 *
 */
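
/*
 * Illustrative C-level usage (a sketch only; 'buf' and 'pat' are made-up
 * names and are not part of this file).  On Darwin these routines are
 * declared in <string.h>:
 *
 *	#include <string.h>
 *	#include <stdint.h>
 *
 *	uint32_t pat = 0x00FF00FF;			// 4-byte fill pattern
 *	char buf[100];
 *	memset_pattern4(buf, &pat, sizeof(buf));	// fill buf with the pattern
 *
 * The pattern pointer may be arbitrarily aligned, and 'len' need not be a
 * multiple of the pattern size; any trailing partial copy is truncated, as
 * the byte-wise tail handling below shows.
 */
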
	.text
	.align 2
	.syntax unified

/*----------------------------------------------------------------------------*/
/* void memset_pattern4(void *ptr, const void *pattern4, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 4-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
	.globl _memset_pattern4
_memset_pattern4:
	cmp r2, #0 /* check if len is zero */
	bxeq lr /* return if length is zero */

	/* We need some registers, so save the callee-saved ones we use on the stack */
	/* Avoid r7 (frame pointer) and r9 (thread register) */
	stmfd sp!, {r4-r7, lr}
	add r7, sp, #12 /* establish frame */
	stmfd sp!, {r8, r10-r11}

	/* copy destination base pointer r0 to r12 and leave r0 alone */
	/* so that we return the original pointer back to the caller */
	mov r12, r0

	/* Check if 'len' is long enough to bother aligning the destination */
	/* pointer */
	cmp r2, #32 /* long enough to bother aligning? */
	movlt r3, #4 /* move pattern length into r3 */
	movlt r10, #4 /* pattern index */
	movlt r11, r1 /* move pattern pointer into r11 */
	blt L_Short /* no */

	/* move 'len' into r1, get 4-byte pattern in r2 */
	mov r6, r2 /* temporarily move 'len' into r6 */
	bl L_GetPatternWord /* get unaligned pattern word in r5 */
	mov r2, r5 /* move pattern word into r2 */
	mov r0, r12 /* r0 was clobbered - restore it */
	mov r1, r6 /* move 'len' from r6 to r1 */

	mov r3, r2 /* copy 4-byte pattern into r3, r4 and r5 registers */
	mov r4, r2
	mov r5, r2

L_NotShort:

	/* Check for 16 or 32 byte aligned destination pointer */
	tst r12, #0x1F /* check for 32 byte aligned */
	beq L_Aligned
	tst r12, #0xF /* check for 16 byte aligned */
	beq L_16ByteAligned
	b L_Unaligned /* not even 16-byte aligned */

L_Bytewise:
	ldrb r4, [r11], #1
	strb r4, [r12], #1
	subs r10, #1
	moveq r10, r3
	moveq r11, r1
	sub r2, #1

L_Short:
	cmp r2, #0 /* more bytes left? */
	bne L_Bytewise
	ldm sp!, {r8, r10-r11} /* restore registers from stack */
	ldm sp!, {r4-r7, lr} /* restore & return from subroutine */
	bx lr

/* 'len' is long enough to justify aligning the destination pointer */
/* */
/* By the time we reach here, data is stored in registers as follows: */
/* r1 << 'len' (length of destination buffer in bytes) */
/* r2-r5 << pattern; either 4x4byte OR 2x8byte OR 1x16-byte */
/* r12 << destination pointer copy (scratch register) */
/* r0 << destination pointer original */
/* */
/* Use r11 as a scratch register to hold the byte offset needed for 16-byte alignment */
/* */
/* Not even 16-byte aligned here: store 1-15 bytes until the destination is 16-byte aligned */
/* As we store these bytes, we rotate the pattern stored in r2-r5 to reflect */
/* the alignment. */

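/* Worked example (illustrative values, not from the original source): for */
/* memset_pattern4 with pattern bytes 11 22 33 44, r2-r5 all start out as */
/* 0x44332211 (little-endian). After the first byte (0x11) is stored, the */
/* 16-byte value in r2-r5 is rotated right by one byte, so each register */
/* becomes 0x11443322 and the next byte to be written (the low byte of r2) */
/* is 0x22, continuing the pattern at the new destination offset. */
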
L_Unaligned:
	mov r11, r12, lsl #28
	rsb r11, r11, #0
	msr cpsr_f, r11 /* Bits[31:28] of cpsr now contain #bytes to align */
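	/* The NZCV flags now encode that count: N = the 8s bit, Z = the 4s bit, */
	/* C = the 2s bit, V = the 1s bit. Each conditional store below peels */
	/* off the corresponding number of bytes. */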

L_Store15BytesAndRotatePattern:
	strbvs r2, [r12], #1 /* v is set, unaligned in the 1s column */
	andvs r6, r2, #0xFF /* Rotate pattern right in r2-r5 by 1-byte */
	andvs r8, r3, #0xFF /* Consider registers r2-r5 as a contiguous */
	andvs r10, r4, #0xFF /* 16-byte register with r2 holding the LSB */
	andvs r11, r5, #0xFF /* and r5 containing the MSB */
	lsrvs r2, r2, #8
	lsrvs r3, r3, #8
	lsrvs r4, r4, #8
	lsrvs r5, r5, #8
	orrvs r2, r2, r8, lsl #24
	orrvs r3, r3, r10, lsl #24
	orrvs r4, r4, r11, lsl #24
	orrvs r5, r5, r6, lsl #24

	strhcs r2, [r12], #2 /* c is set, unaligned in the 2s column */
	movcs r6, r2, lsl #16 /* Rotate pattern right in r2-r5 by 2-bytes */
	movcs r8, r3, lsl #16
	movcs r10, r4, lsl #16
	movcs r11, r5, lsl #16
	lsrcs r2, r2, #16
	lsrcs r3, r3, #16
	lsrcs r4, r4, #16
	lsrcs r5, r5, #16
	orrcs r2, r2, r8
	orrcs r3, r3, r10
	orrcs r4, r4, r11
	orrcs r5, r5, r6

	streq r2, [r12], #4 /* z is set, unaligned in the 4s column */
	moveq r6, r2 /* Rotate pattern right in r2-r5 by 4-bytes */
	moveq r2, r3
	moveq r3, r4
	moveq r4, r5
	moveq r5, r6

	stmmi r12!, {r2-r3} /* n is set, unaligned in the 8s column */
	movmi r6, r2 /* Rotate pattern right in r2-r5 by 8-bytes */
	movmi r8, r3
	movmi r2, r4
	movmi r3, r5
	movmi r4, r6
	movmi r5, r8

	mrs r11, cpsr /* copy cpsr into r11 */
	subs r1, r1, r11, lsr #28
	ldmeq sp!, {r8, r10-r11} /* restore registers from stack */
	ldmeq sp!, {r4-r7, lr} /* restore & return from subroutine */
	bxeq lr

/* By the time we reach here, we are 16-byte aligned and r2-r5 contain the */
/* rotated pattern. Now let's make sure we are 32-byte aligned. */
L_16ByteAligned:
	tst r12, #(1 << 4)
	stmne r12!, {r2-r5}
	subsne r1, r1, #16

/* By the time we reach here, data is stored in registers as follows: */
/* r1 << 'len' (remaining length of destination buffer in bytes) */
/* r2-r5 << rotated pattern; either 4x4byte OR 2x8byte OR 1x16-byte */
/* r12 << aligned destination pointer copy (scratch register) */
L_Aligned:
	cmp r1, #64
	blt L_AlignedLessThan64

	/* Copy pattern into four more registers so that we can do 64 byte transfers */
	mov r6, r2
	mov r8, r3
	mov r10, r4
	mov r11, r5

	/* At this point, we are 32-byte aligned and 'len' is at least 64 bytes */
	/* Let's transfer 64 bytes at a time until len becomes less than 64 bytes */
	sub r1, r1, #64 /* pre-subtract to avoid extra compare in loop */
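	/* The loop below stores 64 bytes per pass (two 8-register stm's). */
	/* Because 64 was pre-subtracted from r1, the ge test succeeds only */
	/* while at least one more full 64-byte block remains to be stored. */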
L_Loop64:
	stm r12!, {r2-r6, r8, r10-r11}
	subs r1, r1, #64
	stm r12!, {r2-r6, r8, r10-r11}
	bge L_Loop64

	/* return if 'len' is zero */
	adds r1, r1, #64 /* readjust length; previously subtracted an extra 64 */
	ldmeq sp!, {r8, r10-r11} /* restore registers from stack */
	ldmeq sp!, {r4-r7, lr} /* restore & return from subroutine */
	bxeq lr

L_AlignedLessThan64:
	/* do we have 16 or more bytes left? */
	cmp r1, #16
	stmge r12!, {r2-r5}
	subsge r1, r1, #16
	bgt L_AlignedLessThan64
	ldmeq sp!, {r8, r10-r11} /* restore registers from stack */
	ldmeq sp!, {r4-r7, lr} /* restore & return from subroutine */
	bxeq lr

L_AlignedLessThan16:
	/* store the last up to 15 bytes */
	/* move the remaining len bits [3:0] to the flags area of cpsr */
	mov r1, r1, lsl #28
	msr cpsr_f, r1
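	/* Same NZCV trick as in L_Unaligned above: N, Z, C and V now hold bits */
	/* 3:0 of the remaining length, selecting 8-, 4-, 2- and 1-byte stores. */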

	stmmi r12!, {r2-r3} /* n is set, store 8 bytes */
	movmi r2, r4 /* shift vector down 8 bytes */
	movmi r3, r5

	streq r2, [r12], #4 /* z is set, store 4 bytes */
	moveq r2, r3 /* shift vector down 4 bytes */

	strhcs r2, [r12], #2 /* c is set, store 2 bytes */
	lsrcs r2, #16 /* shift register right 2 bytes */

	strbvs r2, [r12], #1 /* v is set, store 1 byte */
	ldm sp!, {r8, r10-r11} /* restore registers from stack */
	ldm sp!, {r4-r7, lr} /* restore & return from subroutine */
	bx lr

/*----------------------------------------------------------------------------*/
/* void memset_pattern8(void *ptr, const void *pattern8, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 8-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
	.globl _memset_pattern8
_memset_pattern8:
	cmp r2, #0 /* check if len is zero */
	bxeq lr /* return if length is zero */

	/* We need some registers, so save the callee-saved ones we use on the stack */
	/* Avoid r7 (frame pointer) and r9 (thread register) */
	stmfd sp!, {r4-r7, lr}
	add r7, sp, #12 /* establish frame */
	stmfd sp!, {r8, r10-r11}

	/* copy destination base pointer r0 to r12 and leave r0 alone */
	/* so that we return the original pointer back to the caller */
	mov r12, r0

	/* Check if 'len' is long enough to bother aligning the destination */
	/* pointer */
	cmp r2, #32 /* long enough to bother aligning? */
	movlt r3, #8 /* move pattern length into r3 */
	movlt r10, #8 /* pattern index */
	movlt r11, r1 /* move pattern pointer into r11 */
	blt L_Short /* no */

	/* move 'len' into r1, get 8-byte pattern in r2-r3 */
	mov r6, r2 /* temporarily move 'len' into r6 */
	bl L_GetPatternWord /* get unaligned pattern word in r5 */
	mov r2, r5 /* move pattern word into r2 */
	bl L_GetPatternWord
	mov r3, r5
	mov r0, r12 /* r0 was clobbered - restore it */
	mov r1, r6 /* move 'len' from r6 to r1 */

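	/* Duplicate the 8-byte pattern so that r2-r5 hold it twice (16 bytes); */
	/* the shared L_NotShort/L_Aligned code can then be reused unchanged. */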
	mov r4, r2 /* copy 8-byte pattern into r4-r5 registers */
	mov r5, r3
	b L_NotShort /* yes */


/*----------------------------------------------------------------------------*/
/* void memset_pattern16(void *ptr, const void *pattern16, size_t len); */
/* */
/* r0 << destination pointer */
/* r1 << pointer to 16-byte pattern */
/* r2 << 'len' (length of destination buffer in bytes) */
/*----------------------------------------------------------------------------*/
	.globl _memset_pattern16
_memset_pattern16:
	cmp r2, #0 /* check if len is zero */
	bxeq lr /* return if length is zero */

	/* We need some registers, so save the callee-saved ones we use on the stack */
	/* Avoid r7 (frame pointer) and r9 (thread register) */
	stmfd sp!, {r4-r7, lr}
	add r7, sp, #12 /* establish frame */
	stmfd sp!, {r8, r10-r11}

	/* copy destination base pointer r0 to r12 and leave r0 alone */
	/* so that we return the original pointer back to the caller */
	mov r12, r0

	/* Check if 'len' is long enough to bother aligning the destination */
	/* pointer */
	cmp r2, #32 /* long enough to bother aligning? */
	movlt r3, #16 /* move pattern length into r3 */
	movlt r10, #16 /* pattern index */
	movlt r11, r1 /* move pattern pointer into r11 */
	blt L_Short /* no */

	/* move 'len' into r1, get 16-byte pattern in r2-r5 */
	mov r6, r2 /* temporarily move 'len' into r6 */
	bl L_GetPatternWord /* get unaligned pattern word in r5 */
	mov r2, r5 /* move pattern word into r2 */
	bl L_GetPatternWord
	mov r3, r5
	bl L_GetPatternWord
	mov r4, r5
	bl L_GetPatternWord
	mov r0, r12 /* r0 was clobbered - restore it */
	mov r1, r6 /* move 'len' from r6 to r1 */

	b L_NotShort /* yes */


/*----------------------------------------------------------------------------*/
/* Get an unaligned word at r1, returning it in r5. */
/* Increments r1 by 4, clobbers r0. */
/* This is tailored to fit the register usage by the call sites. */
/*----------------------------------------------------------------------------*/
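/* The pattern pointer passed by the caller has no alignment guarantee, so */
/* the word is assembled from four byte loads rather than a single ldr. */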
L_GetPatternWord:
	ldrb r5, [r1], #1 /* get the 1st byte at r1 */
	ldrb r0, [r1], #1 /* get the 2nd byte at r1 */
	orr r5, r5, r0, lsl #8 /* move into bits 15:8 */
	ldrb r0, [r1], #1 /* get the 3rd byte */
	orr r5, r5, r0, lsl #16 /* bits 23:16 */
	ldrb r0, [r1], #1 /* get the 4th byte */
	orr r5, r5, r0, lsl #24 /* bits 31:24 */
	bx lr